From dec592809e548a13ad74d4d7a85f99602515fda5 Mon Sep 17 00:00:00 2001 From: Gunay Mert Karadogan Date: Sun, 2 Feb 2020 17:09:34 +0000 Subject: [PATCH 1/2] =?UTF-8?q?Sync=20=E2=80=9C9-regular-expressions?= =?UTF-8?q?=E2=80=9D=20folder=20with=20upstream?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../02-regexp-character-classes/article.md | 203 ++++++++ .../love-html5-classes.svg | 1 + .../02-regexp-methods/article.md | 458 ------------------ .../03-regexp-character-classes/article.md | 269 ---------- .../hello-java-boundaries.svg | 18 - .../love-html5-classes.svg | 18 - .../03-regexp-unicode/article.md | 167 +++++++ .../1-start-end/solution.md | 3 +- .../1-start-end/task.md | 0 .../04-regexp-anchors/article.md | 52 ++ .../article.md | 114 ----- .../05-regexp-multiline-mode/article.md | 87 ++++ .../1-find-time-hh-mm/solution.md | 0 .../1-find-time-hh-mm/task.md | 1 + .../06-regexp-boundary/article.md | 52 ++ .../hello-java-boundaries.svg | 1 + .../article.md | 36 +- .../1-find-range-1/solution.md | 2 +- .../1-find-range-1/task.md | 0 .../2-find-time-2-formats/solution.md | 4 +- .../2-find-time-2-formats/task.md | 4 +- .../article.md | 197 ++++++++ .../3-find-html-comments/solution.md | 17 - .../witch_greedy1.svg | 16 - .../witch_greedy2.svg | 16 - .../witch_greedy3.svg | 16 - .../witch_greedy4.svg | 19 - .../witch_greedy5.svg | 19 - .../witch_greedy6.svg | 17 - .../08-regexp-greedy-and-lazy/witch_lazy3.svg | 18 - .../08-regexp-greedy-and-lazy/witch_lazy4.svg | 18 - .../08-regexp-greedy-and-lazy/witch_lazy5.svg | 16 - .../08-regexp-greedy-and-lazy/witch_lazy6.svg | 18 - .../1-find-webcolor-3-or-6/solution.md | 29 -- .../solution.md | 18 - .../3-find-decimal-positive-numbers/task.md | 12 - .../09-regexp-groups/article.md | 237 --------- .../09-regexp-groups/regexp-nested-groups.svg | 48 -- .../1-find-text-manydots/solution.md | 4 +- .../1-find-text-manydots/task.md | 4 +- .../2-find-html-colors-6hex/solution.md | 8 
+- .../2-find-html-colors-6hex/task.md | 6 +- .../article.md | 64 +-- .../10-regexp-backreferences/article.md | 65 --- .../1-lazy-greedy/solution.md | 0 .../1-lazy-greedy/task.md | 0 .../3-find-html-comments/solution.md | 15 + .../3-find-html-comments/task.md | 4 +- .../4-find-html-tags-greedy-lazy/solution.md | 4 +- .../4-find-html-tags-greedy-lazy/task.md | 4 +- .../article.md | 81 ++-- .../witch_greedy1.svg | 1 + .../witch_greedy2.svg | 1 + .../witch_greedy3.svg | 1 + .../witch_greedy4.svg | 1 + .../witch_greedy5.svg | 1 + .../witch_greedy6.svg | 1 + .../10-regexp-greedy-and-lazy/witch_lazy3.svg | 1 + .../10-regexp-greedy-and-lazy/witch_lazy4.svg | 1 + .../10-regexp-greedy-and-lazy/witch_lazy5.svg | 1 + .../10-regexp-greedy-and-lazy/witch_lazy6.svg | 1 + .../02-find-matching-bbtags/solution.md | 23 - .../11-regexp-alternation/article.md | 59 --- .../11-regexp-groups/01-test-mac/solution.md | 21 + .../01-test-mac}/task.md | 10 +- .../02-find-webcolor-3-or-6/solution.md | 27 ++ .../02-find-webcolor-3-or-6}/task.md | 6 +- .../03-find-decimal-numbers}/solution.md | 6 +- .../03-find-decimal-numbers}/task.md | 4 +- .../04-parse-expression}/solution.md | 25 +- .../04-parse-expression}/task.md | 0 .../11-regexp-groups/article.md | 364 ++++++++++++++ .../regexp-nested-groups-matches.svg | 1 + .../regexp-nested-groups-pattern.svg | 1 + .../12-regexp-anchors/2-test-mac/solution.md | 21 - .../12-regexp-anchors/article.md | 55 --- .../12-regexp-backreferences/article.md | 72 +++ .../01-find-programming-language/solution.md | 8 +- .../01-find-programming-language/task.md | 4 +- .../02-find-matching-bbtags/solution.md | 23 + .../02-find-matching-bbtags/task.md | 10 +- .../03-match-quoted-string/solution.md | 4 +- .../03-match-quoted-string/task.md | 0 .../04-match-exact-tag/solution.md | 4 +- .../04-match-exact-tag/task.md | 4 +- .../13-regexp-alternation/article.md | 70 +++ .../13-regexp-multiline-mode/article.md | 76 --- .../1-find-non-negative-integers/solution.md | 28 ++ 
.../1-find-non-negative-integers/task.md | 14 + .../2-insert-after-head/solution.md | 36 ++ .../2-insert-after-head/task.md | 30 ++ .../14-regexp-lookahead-lookbehind/article.md | 97 ++-- .../article.md | 301 ++++++++++++ .../article.md | 295 ----------- .../16-regexp-sticky/article.md | 127 +++++ .../17-regexp-methods/article.md | 348 +++++++++++++ .../20-regexp-unicode/article.md | 89 ---- .../21-regexp-unicode-properties/article.md | 86 ---- .../22-regexp-sticky/article.md | 73 --- 99 files changed, 2466 insertions(+), 2446 deletions(-) create mode 100644 9-regular-expressions/02-regexp-character-classes/article.md create mode 100644 9-regular-expressions/02-regexp-character-classes/love-html5-classes.svg delete mode 100644 9-regular-expressions/02-regexp-methods/article.md delete mode 100644 9-regular-expressions/03-regexp-character-classes/article.md delete mode 100644 9-regular-expressions/03-regexp-character-classes/hello-java-boundaries.svg delete mode 100644 9-regular-expressions/03-regexp-character-classes/love-html5-classes.svg create mode 100644 9-regular-expressions/03-regexp-unicode/article.md rename 9-regular-expressions/{12-regexp-anchors => 04-regexp-anchors}/1-start-end/solution.md (77%) rename 9-regular-expressions/{12-regexp-anchors => 04-regexp-anchors}/1-start-end/task.md (100%) create mode 100644 9-regular-expressions/04-regexp-anchors/article.md delete mode 100644 9-regular-expressions/05-regexp-character-sets-and-ranges/article.md create mode 100644 9-regular-expressions/05-regexp-multiline-mode/article.md rename 9-regular-expressions/{03-regexp-character-classes => 06-regexp-boundary}/1-find-time-hh-mm/solution.md (100%) rename 9-regular-expressions/{03-regexp-character-classes => 06-regexp-boundary}/1-find-time-hh-mm/task.md (99%) create mode 100644 9-regular-expressions/06-regexp-boundary/article.md create mode 100644 9-regular-expressions/06-regexp-boundary/hello-java-boundaries.svg rename 9-regular-expressions/{04-regexp-escaping => 
07-regexp-escaping}/article.md (55%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/1-find-range-1/solution.md (65%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/1-find-range-1/task.md (100%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/2-find-time-2-formats/solution.md (69%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/2-find-time-2-formats/task.md (76%) create mode 100644 9-regular-expressions/08-regexp-character-sets-and-ranges/article.md delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/solution.md delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy1.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy2.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy3.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy4.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy5.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy6.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy3.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy4.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy5.svg delete mode 100644 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy6.svg delete mode 100644 9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/solution.md delete mode 100644 9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/solution.md delete mode 100644 9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/task.md delete mode 100644 9-regular-expressions/09-regexp-groups/article.md delete 
mode 100644 9-regular-expressions/09-regexp-groups/regexp-nested-groups.svg rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/1-find-text-manydots/solution.md (57%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/1-find-text-manydots/task.md (59%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/2-find-html-colors-6hex/solution.md (67%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/2-find-html-colors-6hex/task.md (71%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/article.md (63%) delete mode 100644 9-regular-expressions/10-regexp-backreferences/article.md rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/1-lazy-greedy/solution.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/1-lazy-greedy/task.md (100%) create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/3-find-html-comments/task.md (56%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/4-find-html-tags-greedy-lazy/solution.md (51%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/4-find-html-tags-greedy-lazy/task.md (74%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/article.md (73%) create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy1.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy2.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy3.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy4.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy5.svg create mode 100644 
9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy6.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy3.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy4.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy5.svg create mode 100644 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy6.svg delete mode 100644 9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md delete mode 100644 9-regular-expressions/11-regexp-alternation/article.md create mode 100644 9-regular-expressions/11-regexp-groups/01-test-mac/solution.md rename 9-regular-expressions/{12-regexp-anchors/2-test-mac => 11-regexp-groups/01-test-mac}/task.md (50%) create mode 100644 9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/solution.md rename 9-regular-expressions/{09-regexp-groups/1-find-webcolor-3-or-6 => 11-regexp-groups/02-find-webcolor-3-or-6}/task.md (59%) rename 9-regular-expressions/{09-regexp-groups/4-find-decimal-numbers => 11-regexp-groups/03-find-decimal-numbers}/solution.md (51%) rename 9-regular-expressions/{09-regexp-groups/4-find-decimal-numbers => 11-regexp-groups/03-find-decimal-numbers}/task.md (71%) rename 9-regular-expressions/{09-regexp-groups/5-parse-expression => 11-regexp-groups/04-parse-expression}/solution.md (50%) rename 9-regular-expressions/{09-regexp-groups/5-parse-expression => 11-regexp-groups/04-parse-expression}/task.md (100%) create mode 100644 9-regular-expressions/11-regexp-groups/article.md create mode 100644 9-regular-expressions/11-regexp-groups/regexp-nested-groups-matches.svg create mode 100644 9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg delete mode 100644 9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md delete mode 100644 9-regular-expressions/12-regexp-anchors/article.md create mode 100644 9-regular-expressions/12-regexp-backreferences/article.md rename 
9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/01-find-programming-language/solution.md (79%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/01-find-programming-language/task.md (65%) create mode 100644 9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/02-find-matching-bbtags/task.md (78%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/03-match-quoted-string/solution.md (87%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/03-match-quoted-string/task.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/04-match-exact-tag/solution.md (72%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/04-match-exact-tag/task.md (68%) create mode 100644 9-regular-expressions/13-regexp-alternation/article.md delete mode 100644 9-regular-expressions/13-regexp-multiline-mode/article.md create mode 100644 9-regular-expressions/14-regexp-lookahead-lookbehind/1-find-non-negative-integers/solution.md create mode 100644 9-regular-expressions/14-regexp-lookahead-lookbehind/1-find-non-negative-integers/task.md create mode 100644 9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/solution.md create mode 100644 9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/task.md create mode 100644 9-regular-expressions/15-regexp-catastrophic-backtracking/article.md delete mode 100644 9-regular-expressions/15-regexp-infinite-backtracking-problem/article.md create mode 100644 9-regular-expressions/16-regexp-sticky/article.md create mode 100644 9-regular-expressions/17-regexp-methods/article.md delete mode 100644 9-regular-expressions/20-regexp-unicode/article.md delete mode 100644 9-regular-expressions/21-regexp-unicode-properties/article.md delete mode 100644 
9-regular-expressions/22-regexp-sticky/article.md diff --git a/9-regular-expressions/02-regexp-character-classes/article.md b/9-regular-expressions/02-regexp-character-classes/article.md new file mode 100644 index 000000000..5b4258869 --- /dev/null +++ b/9-regular-expressions/02-regexp-character-classes/article.md @@ -0,0 +1,203 @@ +# Character classes + +Consider a practical task -- we have a phone number like `"+7(903)-123-45-67"`, and we need to turn it into pure numbers: `79031234567`. + +To do so, we can find and remove anything that's not a number. Character classes can help with that. + +A *character class* is a special notation that matches any symbol from a certain set. + +For the start, let's explore the "digit" class. It's written as `pattern:\d` and corresponds to "any single digit". + +For instance, let's find the first digit in the phone number: + +```js run +let str = "+7(903)-123-45-67"; + +let regexp = /\d/; + +alert( str.match(regexp) ); // 7 +``` + +Without the flag `pattern:g`, the regular expression only looks for the first match, that is the first digit `pattern:\d`. + +Let's add the `pattern:g` flag to find all digits: + +```js run +let str = "+7(903)-123-45-67"; + +let regexp = /\d/g; + +alert( str.match(regexp) ); // array of matches: 7,9,0,3,1,2,3,4,5,6,7 + +// let's make the digits-only phone number of them: +alert( str.match(regexp).join('') ); // 79031234567 +``` + +That was a character class for digits. There are other character classes as well. + +Most used are: + +`pattern:\d` ("d" is from "digit") +: A digit: a character from `0` to `9`. + +`pattern:\s` ("s" is from "space") +: A space symbol: includes spaces, tabs `\t`, newlines `\n` and a few other rare characters, such as `\v`, `\f` and `\r`. + +`pattern:\w` ("w" is from "word") +: A "wordly" character: either a letter of the Latin alphabet or a digit or an underscore `_`. Non-Latin letters (like cyrillic or hindi) do not belong to `pattern:\w`. 
+ +For instance, `pattern:\d\s\w` means a "digit" followed by a "space character" followed by a "wordly character", such as `match:1 a`. + +**A regexp may contain both regular symbols and character classes.** + +For instance, `pattern:CSS\d` matches a string `match:CSS` with a digit after it: + +```js run +let str = "Is there CSS4?"; +let regexp = /CSS\d/ + +alert( str.match(regexp) ); // CSS4 +``` + +Also we can use many character classes: + +```js run +alert( "I love HTML5!".match(/\s\w\w\w\w\d/) ); // ' HTML5' +``` + +The match (each regexp character class has the corresponding result character): + +![](love-html5-classes.svg) + +## Inverse classes + +For every character class there exists an "inverse class", denoted with the same letter, but uppercased. + +The "inverse" means that it matches all other characters, for instance: + +`pattern:\D` +: Non-digit: any character except `pattern:\d`, for instance a letter. + +`pattern:\S` +: Non-space: any character except `pattern:\s`, for instance a letter. + +`pattern:\W` +: Non-wordly character: anything but `pattern:\w`, e.g. a non-Latin letter or a space. + +In the beginning of the chapter we saw how to make a number-only phone number from a string like `subject:+7(903)-123-45-67`: find all digits and join them. + +```js run +let str = "+7(903)-123-45-67"; + +alert( str.match(/\d/g).join('') ); // 79031234567 +``` + +An alternative, shorter way is to find non-digits `pattern:\D` and remove them from the string: + +```js run +let str = "+7(903)-123-45-67"; + +alert( str.replace(/\D/g, "") ); // 79031234567 +``` + +## A dot is "any character" + +A dot `pattern:.` is a special character class that matches "any character except a newline". 
+ +For instance: + +```js run +alert( "Z".match(/./) ); // Z +``` + +Or in the middle of a regexp: + +```js run +let regexp = /CS.4/; + +alert( "CSS4".match(regexp) ); // CSS4 +alert( "CS-4".match(regexp) ); // CS-4 +alert( "CS 4".match(regexp) ); // CS 4 (space is also a character) +``` + +Please note that a dot means "any character", but not the "absence of a character". There must be a character to match it: + +```js run +alert( "CS4".match(/CS.4/) ); // null, no match because there's no character for the dot +``` + +### Dot as literally any character with "s" flag + +By default, a dot doesn't match the newline character `\n`. + +For instance, the regexp `pattern:A.B` matches `match:A`, and then `match:B` with any character between them, except a newline `\n`: + +```js run +alert( "A\nB".match(/A.B/) ); // null (no match) +``` + +There are many situations when we'd like a dot to mean literally "any character", newline included. + +That's what flag `pattern:s` does. If a regexp has it, then a dot `pattern:.` matches literally any character: + +```js run +alert( "A\nB".match(/A.B/s) ); // A\nB (match!) +``` + +````warn header="Not supported in Firefox, IE, Edge" +Check for the most recent state of support. At the time of writing it doesn't include Firefox, IE, Edge. + +Luckily, there's an alternative, that works everywhere. We can use a regexp like `pattern:[\s\S]` to match "any character". + +```js run +alert( "A\nB".match(/A[\s\S]B/) ); // A\nB (match!) +``` + +The pattern `pattern:[\s\S]` literally says: "a space character OR not a space character". In other words, "anything". We could use another pair of complementary classes, such as `pattern:[\d\D]`, that doesn't matter. + +This trick works everywhere. Also we can use it if we don't want to set `pattern:s` flag, in cases when we want a regular "no-newline" dot too in the pattern. +```` + +````warn header="Pay attention to spaces" +Usually we pay little attention to spaces. 
For us strings `subject:1-5` and `subject:1 - 5` are nearly identical. + +But if a regexp doesn't take spaces into account, it may fail to work. + +Let's try to find digits separated by a hyphen: + +```js run +alert( "1 - 5".match(/\d-\d/) ); // null, no match! +``` + +Let's fix it adding spaces into the regexp `pattern:\d - \d`: + +```js run +alert( "1 - 5".match(/\d - \d/) ); // 1 - 5, now it works +// or we can use \s class: +alert( "1 - 5".match(/\d\s-\s\d/) ); // 1 - 5, also works +``` + +**A space is a character. Equal in importance with any other character.** + +We can't add or remove spaces from a regular expression and expect it to work the same. + +In other words, in a regular expression all characters matter, spaces too. +```` + +## Summary + +There exist the following character classes: + +- `pattern:\d` -- digits. +- `pattern:\D` -- non-digits. +- `pattern:\s` -- space symbols, tabs, newlines. +- `pattern:\S` -- all but `pattern:\s`. +- `pattern:\w` -- Latin letters, digits, underscore `'_'`. +- `pattern:\W` -- all but `pattern:\w`. +- `pattern:.` -- any character if with the regexp `'s'` flag, otherwise any except a newline `\n`. + +...But that's not all! + +Unicode encoding, used by JavaScript for strings, provides many properties for characters, like: which language the letter belongs to (if it's a letter), is it a punctuation sign, etc. + +We can search by these properties as well. That requires flag `pattern:u`, covered in the next article. 
diff --git a/9-regular-expressions/02-regexp-character-classes/love-html5-classes.svg b/9-regular-expressions/02-regexp-character-classes/love-html5-classes.svg new file mode 100644 index 000000000..9c88cc088 --- /dev/null +++ b/9-regular-expressions/02-regexp-character-classes/love-html5-classes.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/9-regular-expressions/02-regexp-methods/article.md b/9-regular-expressions/02-regexp-methods/article.md deleted file mode 100644 index f66a9c794..000000000 --- a/9-regular-expressions/02-regexp-methods/article.md +++ /dev/null @@ -1,458 +0,0 @@ -# Methods of RegExp and String - -There are two sets of methods to deal with regular expressions. - -1. First, regular expressions are objects of the built-in [RegExp](mdn:js/RegExp) class, it provides many methods. -2. Besides that, there are methods in regular strings can work with regexps. - - -## Recipes - -Which method to use depends on what we'd like to do. - -Methods become much easier to understand if we separate them by their use in real-life tasks. - -So, here are general recipes, the details to follow: - -**To search for all matches:** - -Use regexp `g` flag and: -- Get a flat array of matches -- `str.match(reg)` -- Get an array or matches with details -- `str.matchAll(reg)`. - -**To search for the first match only:** -- Get the full first match -- `str.match(reg)` (without `g` flag). -- Get the string position of the first match -- `str.search(reg)`. -- Check if there's a match -- `regexp.test(str)`. -- Find the match from the given position -- `regexp.exec(str)` (set `regexp.lastIndex` to position). - -**To replace all matches:** -- Replace with another string or a function result -- `str.replace(reg, str|func)` - -**To split the string by a separator:** -- `str.split(str|reg)` - -Now you can continue reading this chapter to get the details about every method... But if you're reading for the first time, then you probably want to know more about regexps. 
So you can move to the next chapter, and then return here if something about a method is unclear. - -## str.search(reg) - -We've seen this method already. It returns the position of the first match or `-1` if none found: - -```js run -let str = "A drop of ink may make a million think"; - -alert( str.search( *!*/a/i*/!* ) ); // 0 (first match at zero position) -``` - -**The important limitation: `search` only finds the first match.** - -We can't find next matches using `search`, there's just no syntax for that. But there are other methods that can. - -## str.match(reg), no "g" flag - -The behavior of `str.match` varies depending on whether `reg` has `g` flag or not. - -First, if there's no `g` flag, then `str.match(reg)` looks for the first match only. - -The result is an array with that match and additional properties: - -- `index` -- the position of the match inside the string, -- `input` -- the subject string. - -For instance: - -```js run -let str = "Fame is the thirst of youth"; - -let result = str.match( *!*/fame/i*/!* ); - -alert( result[0] ); // Fame (the match) -alert( result.index ); // 0 (at the zero position) -alert( result.input ); // "Fame is the thirst of youth" (the string) -``` - -A match result may have more than one element. - -**If a part of the pattern is delimited by parentheses `(...)`, then it becomes a separate element in the array.** - -If parentheses have a name, designated by `(?...)` at their start, then `result.groups[name]` has the content. We'll see that later in the chapter [about groups](info:regexp-groups). 
- -For instance: - -```js run -let str = "JavaScript is a programming language"; - -let result = str.match( *!*/JAVA(SCRIPT)/i*/!* ); - -alert( result[0] ); // JavaScript (the whole match) -alert( result[1] ); // script (the part of the match that corresponds to the parentheses) -alert( result.index ); // 0 -alert( result.input ); // JavaScript is a programming language -``` - -Due to the `i` flag the search is case-insensitive, so it finds `match:JavaScript`. The part of the match that corresponds to `pattern:SCRIPT` becomes a separate array item. - -So, this method is used to find one full match with all details. - - -## str.match(reg) with "g" flag - -When there's a `"g"` flag, then `str.match` returns an array of all matches. There are no additional properties in that array, and parentheses do not create any elements. - -For instance: - -```js run -let str = "HO-Ho-ho!"; - -let result = str.match( *!*/ho/ig*/!* ); - -alert( result ); // HO, Ho, ho (array of 3 matches, case-insensitive) -``` - -Parentheses do not change anything, here we go: - -```js run -let str = "HO-Ho-ho!"; - -let result = str.match( *!*/h(o)/ig*/!* ); - -alert( result ); // HO, Ho, ho -``` - -**So, with `g` flag `str.match` returns a simple array of all matches, without details.** - -If we want to get information about match positions and contents of parentheses then we should use `matchAll` method that we'll cover below. - -````warn header="If there are no matches, `str.match` returns `null`" -Please note, that's important. If there are no matches, the result is not an empty array, but `null`. - -Keep that in mind to evade pitfalls like this: - -```js run -let str = "Hey-hey-hey!"; - -alert( str.match(/Z/g).length ); // Error: Cannot read property 'length' of null -``` - -Here `str.match(/Z/g)` is `null`, it has no `length` property. -```` - -## str.matchAll(regexp) - -The method `str.matchAll(regexp)` is used to find all matches with all details. 
- -For instance: - -```js run -let str = "Javascript or JavaScript? Should we uppercase 'S'?"; - -let result = str.matchAll( *!*/java(script)/ig*/!* ); - -let [match1, match2] = result; - -alert( match1[0] ); // Javascript (the whole match) -alert( match1[1] ); // script (the part of the match that corresponds to the parentheses) -alert( match1.index ); // 0 -alert( match1.input ); // = str (the whole original string) - -alert( match2[0] ); // JavaScript (the whole match) -alert( match2[1] ); // Script (the part of the match that corresponds to the parentheses) -alert( match2.index ); // 14 -alert( match2.input ); // = str (the whole original string) -``` - -````warn header="`matchAll` returns an iterable, not array" -For instance, if we try to get the first match by index, it won't work: - -```js run -let str = "Javascript or JavaScript??"; - -let result = str.matchAll( /javascript/ig ); - -*!* -alert(result[0]); // undefined (?! there must be a match) -*/!* -``` - -The reason is that the iterator is not an array. We need to run `Array.from(result)` on it, or use `for..of` loop to get matches. - -In practice, if we need all matches, then `for..of` works, so it's not a problem. - -And, to get only few matches, we can use destructuring: - -```js run -let str = "Javascript or JavaScript??"; - -*!* -let [firstMatch] = str.matchAll( /javascript/ig ); -*/!* - -alert(firstMatch); // Javascript -``` -```` - -```warn header="`matchAll` is supernew, may need a polyfill" -The method may not work in old browsers. A polyfill might be needed (this site uses core-js). - -Or you could make a loop with `regexp.exec`, explained below. -``` - -## str.split(regexp|substr, limit) - -Splits the string using the regexp (or a substring) as a delimiter. 
- -We already used `split` with strings, like this: - -```js run -alert('12-34-56'.split('-')) // array of [12, 34, 56] -``` - -But we can split by a regular expression, the same way: - -```js run -alert('12-34-56'.split(/-/)) // array of [12, 34, 56] -``` - -## str.replace(str|reg, str|func) - -This is a generic method for searching and replacing, one of most useful ones. The swiss army knife for searching and replacing. - -We can use it without regexps, to search and replace a substring: - -```js run -// replace a dash by a colon -alert('12-34-56'.replace("-", ":")) // 12:34-56 -``` - -There's a pitfall though. - -**When the first argument of `replace` is a string, it only looks for the first match.** - -You can see that in the example above: only the first `"-"` is replaced by `":"`. - -To find all dashes, we need to use not the string `"-"`, but a regexp `pattern:/-/g`, with an obligatory `g` flag: - -```js run -// replace all dashes by a colon -alert( '12-34-56'.replace( *!*/-/g*/!*, ":" ) ) // 12:34:56 -``` - -The second argument is a replacement string. We can use special characters in it: - -| Symbol | Inserts | -|--------|--------| -|`$$`|`"$"` | -|`$&`|the whole match| -|$`|a part of the string before the match| -|`$'`|a part of the string after the match| -|`$n`|if `n` is a 1-2 digit number, then it means the contents of n-th parentheses counting from left to right, otherwise it means a parentheses with the given name | - - -For instance if we use `$&` in the replacement string, that means "put the whole match here". - -Let's use it to prepend all entries of `"John"` with `"Mr."`: - -```js run -let str = "John Doe, John Smith and John Bull"; - -// for each John - replace it with Mr. and then John -alert(str.replace(/John/g, 'Mr.$&')); // Mr.John Doe, Mr.John Smith and Mr.John Bull -``` - -Quite often we'd like to reuse parts of the source string, recombine them in the replacement or wrap into something. - -To do so, we should: -1. 
First, mark the parts by parentheses in regexp. -2. Use `$1`, `$2` (and so on) in the replacement string to get the content matched by 1st, 2nd and so on parentheses. - -For instance: - -```js run -let str = "John Smith"; - -// swap first and last name -alert(str.replace(/(john) (smith)/i, '$2, $1')) // Smith, John -``` - -**For situations that require "smart" replacements, the second argument can be a function.** - -It will be called for each match, and its result will be inserted as a replacement. - -For instance: - -```js run -let i = 0; - -// replace each "ho" by the result of the function -alert("HO-Ho-ho".replace(/ho/gi, function() { - return ++i; -})); // 1-2-3 -``` - -In the example above the function just returns the next number every time, but usually the result is based on the match. - -The function is called with arguments `func(str, p1, p2, ..., pn, offset, input, groups)`: - -1. `str` -- the match, -2. `p1, p2, ..., pn` -- contents of parentheses (if there are any), -3. `offset` -- position of the match, -4. `input` -- the source string, -5. `groups` -- an object with named groups (see chapter [](info:regexp-groups)). - -If there are no parentheses in the regexp, then there are only 3 arguments: `func(str, offset, input)`. 
- -Let's use it to show full information about matches: - -```js run -// show and replace all matches -function replacer(str, offset, input) { - alert(`Found ${str} at position ${offset} in string ${input}`); - return str.toLowerCase(); -} - -let result = "HO-Ho-ho".replace(/ho/gi, replacer); -alert( 'Result: ' + result ); // Result: ho-ho-ho - -// shows each match: -// Found HO at position 0 in string HO-Ho-ho -// Found Ho at position 3 in string HO-Ho-ho -// Found ho at position 6 in string HO-Ho-ho -``` - -In the example below there are two parentheses, so `replacer` is called with 5 arguments: `str` is the full match, then parentheses, and then `offset` and `input`: - -```js run -function replacer(str, name, surname, offset, input) { - // name is the first parentheses, surname is the second one - return surname + ", " + name; -} - -let str = "John Smith"; - -alert(str.replace(/(John) (Smith)/, replacer)) // Smith, John -``` - -Using a function gives us the ultimate replacement power, because it gets all the information about the match, has access to outer variables and can do everything. - -## regexp.exec(str) - -We've already seen these searching methods: - -- `search` -- looks for the position of the match, -- `match` -- if there's no `g` flag, returns the first match with parentheses and all details, -- `match` -- if there's a `g` flag -- returns all matches, without details parentheses, -- `matchAll` -- returns all matches with details. - -The `regexp.exec` method is the most flexible searching method of all. Unlike previous methods, `exec` should be called on a regexp, rather than on a string. - -It behaves differently depending on whether the regexp has the `g` flag. - -If there's no `g`, then `regexp.exec(str)` returns the first match, exactly as `str.match(reg)`. Such behavior does not give us anything new. - -But if there's `g`, then: -- `regexp.exec(str)` returns the first match and *remembers* the position after it in `regexp.lastIndex` property. 
-- The next call starts to search from `regexp.lastIndex` and returns the next match. -- If there are no more matches then `regexp.exec` returns `null` and `regexp.lastIndex` is set to `0`. - -We could use it to get all matches with their positions and parentheses groups in a loop, instead of `matchAll`: - -```js run -let str = 'A lot about JavaScript at https://javascript.info'; - -let regexp = /javascript/ig; - -let result; - -while (result = regexp.exec(str)) { - alert( `Found ${result[0]} at ${result.index}` ); - // shows: Found JavaScript at 12, then: - // shows: Found javascript at 34 -} -``` - -Surely, `matchAll` does the same, at least for modern browsers. But what `matchAll` can't do -- is to search from a given position. - -Let's search from position `13`. What we need is to assign `regexp.lastIndex=13` and call `regexp.exec`: - -```js run -let str = "A lot about JavaScript at https://javascript.info"; - -let regexp = /javascript/ig; -*!* -regexp.lastIndex = 13; -*/!* - -let result; - -while (result = regexp.exec(str)) { - alert( `Found ${result[0]} at ${result.index}` ); - // shows: Found javascript at 34 -} -``` - -Now, starting from the given position `13`, there's only one match. - - -## regexp.test(str) - -The method `regexp.test(str)` looks for a match and returns `true/false` whether it finds it. - -For instance: - -```js run -let str = "I love JavaScript"; - -// these two tests do the same -alert( *!*/love/i*/!*.test(str) ); // true -alert( str.search(*!*/love/i*/!*) != -1 ); // true -``` - -An example with the negative answer: - -```js run -let str = "Bla-bla-bla"; - -alert( *!*/love/i*/!*.test(str) ); // false -alert( str.search(*!*/love/i*/!*) != -1 ); // false -``` - -If the regexp has `'g'` flag, then `regexp.test` advances `regexp.lastIndex` property, just like `regexp.exec`. 
- -So we can use it to search from a given position: - -```js run -let regexp = /love/gi; - -let str = "I love JavaScript"; - -// start the search from position 10: -regexp.lastIndex = 10 -alert( regexp.test(str) ); // false (no match) -``` - - - -````warn header="Same global regexp tested repeatedly may fail to match" -If we apply the same global regexp to different inputs, it may lead to wrong result, because `regexp.test` call advances `regexp.lastIndex` property, so the search in another string may start from non-zero position. - -For instance, here we call `regexp.test` twice on the same text, and the second time fails: - -```js run -let regexp = /javascript/g; // (regexp just created: regexp.lastIndex=0) - -alert( regexp.test("javascript") ); // true (regexp.lastIndex=10 now) -alert( regexp.test("javascript") ); // false -``` - -That's exactly because `regexp.lastIndex` is non-zero on the second test. - -To work around that, one could use non-global regexps or re-adjust `regexp.lastIndex=0` before a new search. -```` - -## Summary - -There's a variety of many methods on both regexps and strings. - -Their abilities and methods overlap quite a bit, we can do the same by different calls. Sometimes that may cause confusion when starting to learn the language. - -Then please refer to the recipes at the beginning of this chapter, as they provide solutions for the majority of regexp-related tasks. diff --git a/9-regular-expressions/03-regexp-character-classes/article.md b/9-regular-expressions/03-regexp-character-classes/article.md deleted file mode 100644 index 911622162..000000000 --- a/9-regular-expressions/03-regexp-character-classes/article.md +++ /dev/null @@ -1,269 +0,0 @@ -# Character classes - -Consider a practical task -- we have a phone number `"+7(903)-123-45-67"`, and we need to turn it into pure numbers: `79035419441`. - -To do so, we can find and remove anything that's not a number. Character classes can help with that. 
- -A character class is a special notation that matches any symbol from a certain set. - -For the start, let's explore a "digit" class. It's written as `\d`. We put it in the pattern, that means "any single digit". - -For instance, the let's find the first digit in the phone number: - -```js run -let str = "+7(903)-123-45-67"; - -let reg = /\d/; - -alert( str.match(reg) ); // 7 -``` - -Without the flag `g`, the regular expression only looks for the first match, that is the first digit `\d`. - -Let's add the `g` flag to find all digits: - -```js run -let str = "+7(903)-123-45-67"; - -let reg = /\d/g; - -alert( str.match(reg) ); // array of matches: 7,9,0,3,1,2,3,4,5,6,7 - -alert( str.match(reg).join('') ); // 79035419441 -``` - -That was a character class for digits. There are other character classes as well. - -Most used are: - -`\d` ("d" is from "digit") -: A digit: a character from `0` to `9`. - -`\s` ("s" is from "space") -: A space symbol: that includes spaces, tabs, newlines. - -`\w` ("w" is from "word") -: A "wordly" character: either a letter of English alphabet or a digit or an underscore. Non-Latin letters (like cyrillic or hindi) do not belong to `\w`. - -For instance, `pattern:\d\s\w` means a "digit" followed by a "space character" followed by a "wordly character", like `"1 a"`. - -**A regexp may contain both regular symbols and character classes.** - -For instance, `pattern:CSS\d` matches a string `match:CSS` with a digit after it: - -```js run -let str = "CSS4 is cool"; -let reg = /CSS\d/ - -alert( str.match(reg) ); // CSS4 -``` - -Also we can use many character classes: - -```js run -alert( "I love HTML5!".match(/\s\w\w\w\w\d/) ); // ' HTML5' -``` - -The match (each character class corresponds to one result character): - -![](love-html5-classes.svg) - -## Word boundary: \b - -A word boundary `pattern:\b` -- is a special character class. - -It does not denote a character, but rather a boundary between characters. 
- -For instance, `pattern:\bJava\b` matches `match:Java` in the string `subject:Hello, Java!`, but not in the script `subject:Hello, JavaScript!`. - -```js run -alert( "Hello, Java!".match(/\bJava\b/) ); // Java -alert( "Hello, JavaScript!".match(/\bJava\b/) ); // null -``` - -The boundary has "zero width" in a sense that usually a character class means a character in the result (like a wordly character or a digit), but not in this case. - -The boundary is a test. - -When regular expression engine is doing the search, it's moving along the string in an attempt to find the match. At each string position it tries to find the pattern. - -When the pattern contains `pattern:\b`, it tests that the position in string is a word boundary, that is one of three variants: - -- Immediately before is `\w`, and immediately after -- not `\w`, or vise versa. -- At string start, and the first string character is `\w`. -- At string end, and the last string character is `\w`. - -For instance, in the string `subject:Hello, Java!` the following positions match `\b`: - -![](hello-java-boundaries.svg) - -So it matches `pattern:\bHello\b`, because: - -1. At the beginning of the string the first `\b` test matches. -2. Then the word `Hello` matches. -3. Then `\b` matches, as we're between `o` and a space. - -Pattern `pattern:\bJava\b` also matches. But not `pattern:\bHell\b` (because there's no word boundary after `l`) and not `Java!\b` (because the exclamation sign is not a wordly character, so there's no word boundary after it). 
- - -```js run -alert( "Hello, Java!".match(/\bHello\b/) ); // Hello -alert( "Hello, Java!".match(/\bJava\b/) ); // Java -alert( "Hello, Java!".match(/\bHell\b/) ); // null (no match) -alert( "Hello, Java!".match(/\bJava!\b/) ); // null (no match) -``` - -Once again let's note that `pattern:\b` makes the searching engine to test for the boundary, so that `pattern:Java\b` finds `match:Java` only when followed by a word boundary, but it does not add a letter to the result. - -Usually we use `\b` to find standalone English words. So that if we want `"Java"` language then `pattern:\bJava\b` finds exactly a standalone word and ignores it when it's a part of another word, e.g. it won't match `match:Java` in `subject:JavaScript`. - -Another example: a regexp `pattern:\b\d\d\b` looks for standalone two-digit numbers. In other words, it requires that before and after `pattern:\d\d` must be a symbol different from `\w` (or beginning/end of the string). - -```js run -alert( "1 23 456 78".match(/\b\d\d\b/g) ); // 23,78 -``` - -```warn header="Word boundary doesn't work for non-Latin alphabets" -The word boundary check `\b` tests for a boundary between `\w` and something else. But `\w` means an English letter (or a digit or an underscore), so the test won't work for other characters (like cyrillic or hieroglyphs). - -Later we'll come by Unicode character classes that allow to solve the similar task for different languages. -``` - - -## Inverse classes - -For every character class there exists an "inverse class", denoted with the same letter, but uppercased. - -The "reverse" means that it matches all other characters, for instance: - -`\D` -: Non-digit: any character except `\d`, for instance a letter. - -`\S` -: Non-space: any character except `\s`, for instance a letter. - -`\W` -: Non-wordly character: anything but `\w`. - -`\B` -: Non-boundary: a test reverse to `\b`. - -In the beginning of the chapter we saw how to get all digits from the phone `subject:+7(903)-123-45-67`. 
- -One way was to match all digits and join them: - -```js run -let str = "+7(903)-123-45-67"; - -alert( str.match(/\d/g).join('') ); // 79031234567 -``` - -An alternative, shorter way is to find non-digits `\D` and remove them from the string: - - -```js run -let str = "+7(903)-123-45-67"; - -alert( str.replace(/\D/g, "") ); // 79031234567 -``` - -## Spaces are regular characters - -Usually we pay little attention to spaces. For us strings `subject:1-5` and `subject:1 - 5` are nearly identical. - -But if a regexp doesn't take spaces into account, it may fail to work. - -Let's try to find digits separated by a dash: - -```js run -alert( "1 - 5".match(/\d-\d/) ); // null, no match! -``` - -Here we fix it by adding spaces into the regexp `pattern:\d - \d`: - -```js run -alert( "1 - 5".match(/\d - \d/) ); // 1 - 5, now it works -``` - -**A space is a character. Equal in importance with any other character.** - -Of course, spaces in a regexp are needed only if we look for them. Extra spaces (just like any other extra characters) may prevent a match: - -```js run -alert( "1-5".match(/\d - \d/) ); // null, because the string 1-5 has no spaces -``` - -In other words, in a regular expression all characters matter, spaces too. - -## A dot is any character - -The dot `"."` is a special character class that matches "any character except a newline". - -For instance: - -```js run -alert( "Z".match(/./) ); // Z -``` - -Or in the middle of a regexp: - -```js run -let reg = /CS.4/; - -alert( "CSS4".match(reg) ); // CSS4 -alert( "CS-4".match(reg) ); // CS-4 -alert( "CS 4".match(reg) ); // CS 4 (space is also a character) -``` - -Please note that the dot means "any character", but not the "absense of a character". There must be a character to match it: - -```js run -alert( "CS4".match(/CS.4/) ); // null, no match because there's no character for the dot -``` - -### The dotall "s" flag - -Usually a dot doesn't match a newline character. 
- -For instance, `pattern:A.B` matches `match:A`, and then `match:B` with any character between them, except a newline. - -This doesn't match: - -```js run -alert( "A\nB".match(/A.B/) ); // null (no match) - -// a space character would match, or a letter, but not \n -``` - -Sometimes it's inconvenient, we really want "any character", newline included. - -That's what `s` flag does. If a regexp has it, then the dot `"."` match literally any character: - -```js run -alert( "A\nB".match(/A.B/s) ); // A\nB (match!) -``` - -## Summary - -There exist following character classes: - -- `pattern:\d` -- digits. -- `pattern:\D` -- non-digits. -- `pattern:\s` -- space symbols, tabs, newlines. -- `pattern:\S` -- all but `pattern:\s`. -- `pattern:\w` -- English letters, digits, underscore `'_'`. -- `pattern:\W` -- all but `pattern:\w`. -- `pattern:.` -- any character if with the regexp `'s'` flag, otherwise any except a newline. - -...But that's not all! - -The Unicode encoding, used by JavaScript for strings, provides many properties for characters, like: which language the letter belongs to (if a letter) it is it a punctuation sign, etc. - -Modern JavaScript allows to use these properties in regexps to look for characters, for instance: - -- A cyrillic letter is: `pattern:\p{Script=Cyrillic}` or `pattern:\p{sc=Cyrillic}`. -- A dash (be it a small hyphen `-` or a long dash `—`): `pattern:\p{Dash_Punctuation}` or `pattern:\p{pd}`. -- A currency symbol, such as `$`, `€` or another: `pattern:\p{Currency_Symbol}` or `pattern:\p{sc}`. -- ...And much more. Unicode has a lot of character categories that we can select from. - -These patterns require `'u'` regexp flag to work. More about that in the chapter [](info:regexp-unicode). 
diff --git a/9-regular-expressions/03-regexp-character-classes/hello-java-boundaries.svg b/9-regular-expressions/03-regexp-character-classes/hello-java-boundaries.svg deleted file mode 100644 index 65714ef75..000000000 --- a/9-regular-expressions/03-regexp-character-classes/hello-java-boundaries.svg +++ /dev/null @@ -1,18 +0,0 @@ - - - - hello-java-boundaries.svg - Created with sketchtool. - - - - Hello, Java - ! - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/03-regexp-character-classes/love-html5-classes.svg b/9-regular-expressions/03-regexp-character-classes/love-html5-classes.svg deleted file mode 100644 index 4b9f4d295..000000000 --- a/9-regular-expressions/03-regexp-character-classes/love-html5-classes.svg +++ /dev/null @@ -1,18 +0,0 @@ - - - - love-html5-classes.svg - Created with sketchtool. - - - - I love HTML - 5 - - - \s \w \w \w \w \ - d - - - - \ No newline at end of file diff --git a/9-regular-expressions/03-regexp-unicode/article.md b/9-regular-expressions/03-regexp-unicode/article.md new file mode 100644 index 000000000..a8a5f4f5b --- /dev/null +++ b/9-regular-expressions/03-regexp-unicode/article.md @@ -0,0 +1,167 @@ +# Unicode: flag "u" and class \p{...} + +JavaScript uses [Unicode encoding](https://en.wikipedia.org/wiki/Unicode) for strings. Most characters are encoding with 2 bytes, but that allows to represent at most 65536 characters. + +That range is not big enough to encode all possible characters, that's why some rare characters are encoded with 4 bytes, for instance like `𝒳` (mathematical X) or `😄` (a smile), some hieroglyphs and so on. + +Here are the unicode values of some characters: + +| Character | Unicode | Bytes count in unicode | +|------------|---------|--------| +| a | `0x0061` | 2 | +| ≈ | `0x2248` | 2 | +|𝒳| `0x1d4b3` | 4 | +|𝒴| `0x1d4b4` | 4 | +|😄| `0x1f604` | 4 | + +So characters like `a` and `≈` occupy 2 bytes, while codes for `𝒳`, `𝒴` and `😄` are longer, they have 4 bytes. 
+ +A long time ago, when the JavaScript language was created, Unicode encoding was simpler: there were no 4-byte characters. So, some language features still handle them incorrectly. + +For instance, `length` thinks that there are two characters here: + +```js run +alert('😄'.length); // 2 +alert('𝒳'.length); // 2 +``` + +...But we can see that there's only one, right? The point is that `length` treats 4 bytes as two 2-byte characters. That's incorrect, because they must be considered only together (a so-called "surrogate pair"; you can read about them in the article about strings). + +By default, regular expressions also treat 4-byte "long characters" as a pair of 2-byte ones. And, as it happens with strings, that may lead to odd results. We'll see that a bit later, in the article about sets and ranges `[...]`. + +Unlike strings, regular expressions have the flag `pattern:u` that fixes such problems. With this flag, a regexp handles 4-byte characters correctly. And also Unicode property search becomes available, we'll get to it next. + +## Unicode properties \p{...} + +```warn header="Not supported in Firefox and Edge" +Despite being a part of the standard since 2018, unicode properties are not supported in Firefox ([bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1361876)) and Edge ([bug](https://github.com/Microsoft/ChakraCore/issues/2969)). + +There's [XRegExp](http://xregexp.com) library that provides "extended" regular expressions with cross-browser support for unicode properties. +``` + +Every character in Unicode has a lot of properties. They describe what "category" the character belongs to, contain miscellaneous information about it. + +For instance, if a character has `Letter` property, it means that the character belongs to an alphabet (of any language). And `Number` property means that it's a digit: maybe Arabic or Chinese, and so on. + +We can search for characters with a property, written as `pattern:\p{…}`. To use `pattern:\p{…}`, a regular expression must have flag `pattern:u`. 
+ +For instance, `\p{Letter}` denotes a letter in any language. We can also use `\p{L}`, as `L` is an alias of `Letter`. There are shorter aliases for almost every property. + +In the example below three kinds of letters will be found: English, Georgian and Korean. + +```js run +let str = "A ბ ㄱ"; + +alert( str.match(/\p{L}/gu) ); // A,ბ,ㄱ +alert( str.match(/\p{L}/g) ); // null (no matches, as there's no flag "u") +``` + +Here are the main character categories and their subcategories: + +- Letter `L`: + - lowercase `Ll`, + - modifier `Lm`, + - titlecase `Lt`, + - uppercase `Lu`, + - other `Lo`. +- Number `N`: + - decimal digit `Nd`, + - letter number `Nl`, + - other `No`. +- Punctuation `P`: + - connector `Pc`, + - dash `Pd`, + - initial quote `Pi`, + - final quote `Pf`, + - open `Ps`, + - close `Pe`, + - other `Po`. +- Mark `M` (accents etc): + - spacing combining `Mc`, + - enclosing `Me`, + - non-spacing `Mn`. +- Symbol `S`: + - currency `Sc`, + - modifier `Sk`, + - math `Sm`, + - other `So`. +- Separator `Z`: + - line `Zl`, + - paragraph `Zp`, + - space `Zs`. +- Other `C`: + - control `Cc`, + - format `Cf`, + - not assigned `Cn`, + - private use `Co`, + - surrogate `Cs`. + + +So, e.g. if we need letters in lower case, we can write `pattern:\p{Ll}`, punctuation signs: `pattern:\p{P}` and so on. + +There are also other derived categories, like: +- `Alphabetic` (`Alpha`), includes Letters `L`, plus letter numbers `Nl` (e.g. Ⅻ - a character for the roman number 12), plus some other symbols `Other_Alphabetic` (`OAlpha`). +- `Hex_Digit` includes hexadecimal digits: `0-9`, `a-f`. +- ...And so on. + +Unicode supports many different properties, their full list would require a lot of space, so here are the references: + +- List all properties by a character: <https://unicode.org/cldr/utility/character.jsp>. +- List all characters by a property: <https://unicode.org/cldr/utility/list-unicodeset.jsp>. +- Short aliases for properties: <https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt>. +- A full base of Unicode characters in text format, with all properties, is here: <https://www.unicode.org/Public/UCD/latest/ucd/>. 
+ +### Example: hexadecimal numbers + +For instance, let's look for hexadecimal numbers, written as `xFF`, where `F` is a hex digit (0..9 or A..F). + +A hex digit can be denoted as `pattern:\p{Hex_Digit}`: + +```js run +let regexp = /x\p{Hex_Digit}\p{Hex_Digit}/u; + +alert("number: xAF".match(regexp)); // xAF +``` + +### Example: Chinese hieroglyphs + +Let's look for Chinese hieroglyphs. + +There's a unicode property `Script` (a writing system), that may have a value: `Cyrillic`, `Greek`, `Arabic`, `Han` (Chinese) and so on, [here's the full list](https://en.wikipedia.org/wiki/Script_(Unicode)). + +To look for characters in a given writing system we should use `pattern:Script=<value>`, e.g. for Cyrillic letters: `pattern:\p{sc=Cyrillic}`, for Chinese hieroglyphs: `pattern:\p{sc=Han}`, and so on: + +```js run +let regexp = /\p{sc=Han}/gu; // returns Chinese hieroglyphs + +let str = `Hello Привет 你好 123_456`; + +alert( str.match(regexp) ); // 你,好 +``` + +### Example: currency + +Characters that denote a currency, such as `$`, `€`, `¥`, have unicode property `pattern:\p{Currency_Symbol}`, the short alias: `pattern:\p{Sc}`. + +Let's use it to look for prices in the format "currency, followed by a digit": + +```js run +let regexp = /\p{Sc}\d/gu; + +let str = `Prices: $2, €1, ¥9`; + +alert( str.match(regexp) ); // $2,€1,¥9 +``` + +Later, in the article about quantifiers, we'll see how to look for numbers that contain many digits. + +## Summary + +Flag `pattern:u` enables the support of Unicode in regular expressions. + +That means two things: + +1. Characters of 4 bytes are handled correctly: as a single character, not two 2-byte characters. +2. Unicode properties can be used in the search: `\p{…}`. + +With Unicode properties we can look for words in given languages, special characters (quotes, currencies) and so on. 
diff --git a/9-regular-expressions/12-regexp-anchors/1-start-end/solution.md b/9-regular-expressions/04-regexp-anchors/1-start-end/solution.md similarity index 77% rename from 9-regular-expressions/12-regexp-anchors/1-start-end/solution.md rename to 9-regular-expressions/04-regexp-anchors/1-start-end/solution.md index 1a8cbe9a2..702f992d7 100644 --- a/9-regular-expressions/12-regexp-anchors/1-start-end/solution.md +++ b/9-regular-expressions/04-regexp-anchors/1-start-end/solution.md @@ -1,5 +1,4 @@ - -The empty string is the only match: it starts and immediately finishes. +An empty string is the only match: it starts and immediately finishes. The task once again demonstrates that anchors are not characters, but tests. diff --git a/9-regular-expressions/12-regexp-anchors/1-start-end/task.md b/9-regular-expressions/04-regexp-anchors/1-start-end/task.md similarity index 100% rename from 9-regular-expressions/12-regexp-anchors/1-start-end/task.md rename to 9-regular-expressions/04-regexp-anchors/1-start-end/task.md diff --git a/9-regular-expressions/04-regexp-anchors/article.md b/9-regular-expressions/04-regexp-anchors/article.md new file mode 100644 index 000000000..c34999ee5 --- /dev/null +++ b/9-regular-expressions/04-regexp-anchors/article.md @@ -0,0 +1,52 @@ +# Anchors: string start ^ and end $ + +The caret `pattern:^` and dollar `pattern:$` characters have special meaning in a regexp. They are called "anchors". + +The caret `pattern:^` matches at the beginning of the text, and the dollar `pattern:$` -- at the end. + +For instance, let's test if the text starts with `Mary`: + +```js run +let str1 = "Mary had a little lamb"; +alert( /^Mary/.test(str1) ); // true +``` + +The pattern `pattern:^Mary` means: "string start and then Mary". 
+ +Similar to this, we can test if the string ends with `snow` using `pattern:snow$`: + +```js run +let str1 = "it's fleece was white as snow"; +alert( /snow$/.test(str1) ); // true +``` + +In these particular cases we could use string methods `startsWith/endsWith` instead. Regular expressions should be used for more complex tests. + +## Testing for a full match + +Both anchors together `pattern:^...$` are often used to test whether or not a string fully matches the pattern. For instance, to check if the user input is in the right format. + +Let's check whether or not a string is a time in `12:34` format. That is: two digits, then a colon, and then another two digits. + +In regular expressions language that's `pattern:\d\d:\d\d`: + +```js run +let goodInput = "12:34"; +let badInput = "12:345"; + +let regexp = /^\d\d:\d\d$/; +alert( regexp.test(goodInput) ); // true +alert( regexp.test(badInput) ); // false +``` + +Here the match for `pattern:\d\d:\d\d` must start exactly after the beginning of the text `pattern:^`, and the end `pattern:$` must immediately follow. + +The whole string must be exactly in this format. If there's any deviation or an extra character, the result is `false`. + +Anchors behave differently if flag `pattern:m` is present. We'll see that in the next article. + +```smart header="Anchors have \"zero width\"" +Anchors `pattern:^` and `pattern:$` are tests. They have zero width. + +In other words, they do not match a character, but rather force the regexp engine to check the condition (text start/end). +``` diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/05-regexp-character-sets-and-ranges/article.md deleted file mode 100644 index 7204f2b1c..000000000 --- a/9-regular-expressions/05-regexp-character-sets-and-ranges/article.md +++ /dev/null @@ -1,114 +0,0 @@ -# Sets and ranges [...] 
- -Several characters or character classes inside square brackets `[…]` mean to "search for any character among given". - -## Sets - -For instance, `pattern:[eao]` means any of the 3 characters: `'a'`, `'e'`, or `'o'`. - -That's called a *set*. Sets can be used in a regexp along with regular characters: - -```js run -// find [t or m], and then "op" -alert( "Mop top".match(/[tm]op/gi) ); // "Mop", "top" -``` - -Please note that although there are multiple characters in the set, they correspond to exactly one character in the match. - -So the example below gives no matches: - -```js run -// find "V", then [o or i], then "la" -alert( "Voila".match(/V[oi]la/) ); // null, no matches -``` - -The pattern assumes: - -- `pattern:V`, -- then *one* of the letters `pattern:[oi]`, -- then `pattern:la`. - -So there would be a match for `match:Vola` or `match:Vila`. - -## Ranges - -Square brackets may also contain *character ranges*. - -For instance, `pattern:[a-z]` is a character in range from `a` to `z`, and `pattern:[0-5]` is a digit from `0` to `5`. - -In the example below we're searching for `"x"` followed by two digits or letters from `A` to `F`: - -```js run -alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF -``` - -Please note that in the word `subject:Exception` there's a substring `subject:xce`. It didn't match the pattern, because the letters are lowercase, while in the set `pattern:[0-9A-F]` they are uppercase. - -If we want to find it too, then we can add a range `a-f`: `pattern:[0-9A-Fa-f]`. The `i` flag would allow lowercase too. - -**Character classes are shorthands for certain character sets.** - -For instance: - -- **\d** -- is the same as `pattern:[0-9]`, -- **\w** -- is the same as `pattern:[a-zA-Z0-9_]`, -- **\s** -- is the same as `pattern:[\t\n\v\f\r ]` plus few other unicode space characters. - -We can use character classes inside `[…]` as well. - -For instance, we want to match all wordly characters or a dash, for words like "twenty-third". 
We can't do it with `pattern:\w+`, because `pattern:\w` class does not include a dash. But we can use `pattern:[\w-]`. - -We also can use several classes, for example `pattern:[\s\S]` matches spaces or non-spaces -- any character. That's wider than a dot `"."`, because the dot matches any character except a newline (unless `s` flag is set). - -## Excluding ranges - -Besides normal ranges, there are "excluding" ranges that look like `pattern:[^…]`. - -They are denoted by a caret character `^` at the start and match any character *except the given ones*. - -For instance: - -- `pattern:[^aeyo]` -- any character except `'a'`, `'e'`, `'y'` or `'o'`. -- `pattern:[^0-9]` -- any character except a digit, the same as `\D`. -- `pattern:[^\s]` -- any non-space character, same as `\S`. - -The example below looks for any characters except letters, digits and spaces: - -```js run -alert( "alice15@gmail.com".match(/[^\d\sA-Z]/gi) ); // @ and . -``` - -## No escaping in […] - -Usually when we want to find exactly the dot character, we need to escape it like `pattern:\.`. And if we need a backslash, then we use `pattern:\\`. - -In square brackets the vast majority of special characters can be used without escaping: - -- A dot `pattern:'.'`. -- A plus `pattern:'+'`. -- Parentheses `pattern:'( )'`. -- Dash `pattern:'-'` in the beginning or the end (where it does not define a range). -- A caret `pattern:'^'` if not in the beginning (where it means exclusion). -- And the opening square bracket `pattern:'['`. - -In other words, all special characters are allowed except where they mean something for square brackets. - -A dot `"."` inside square brackets means just a dot. The pattern `pattern:[.,]` would look for one of characters: either a dot or a comma. 
- -In the example below the regexp `pattern:[-().^+]` looks for one of the characters `-().^+`: - -```js run -// No need to escape -let reg = /[-().^+]/g; - -alert( "1 + 2 - 3".match(reg) ); // Matches +, - -``` - -...But if you decide to escape them "just in case", then there would be no harm: - -```js run -// Escaped everything -let reg = /[\-\(\)\.\^\+]/g; - -alert( "1 + 2 - 3".match(reg) ); // also works: +, - -``` diff --git a/9-regular-expressions/05-regexp-multiline-mode/article.md b/9-regular-expressions/05-regexp-multiline-mode/article.md new file mode 100644 index 000000000..539f9fa23 --- /dev/null +++ b/9-regular-expressions/05-regexp-multiline-mode/article.md @@ -0,0 +1,87 @@ +# Multiline mode of anchors ^ $, flag "m" + +The multiline mode is enabled by the flag `pattern:m`. + +It only affects the behavior of `pattern:^` and `pattern:$`. + +In the multiline mode they match not only at the beginning and the end of the string, but also at start/end of line. + +## Searching at line start ^ + +In the example below the text has multiple lines. The pattern `pattern:/^\d/gm` takes a digit from the beginning of each line: + +```js run +let str = `1st place: Winnie +2nd place: Piglet +3rd place: Eeyore`; + +*!* +alert( str.match(/^\d/gm) ); // 1, 2, 3 +*/!* +``` + +Without the flag `pattern:m` only the first digit is matched: + +```js run +let str = `1st place: Winnie +2nd place: Piglet +3rd place: Eeyore`; + +*!* +alert( str.match(/^\d/g) ); // 1 +*/!* +``` + +That's because by default a caret `pattern:^` only matches at the beginning of the text, and in the multiline mode -- at the start of any line. + +```smart +"Start of a line" formally means "immediately after a line break": the test `pattern:^` in multiline mode matches at all positions preceded by a newline character `\n`. + +And at the text start. +``` + +## Searching at line end $ + +The dollar sign `pattern:$` behaves similarly. 
+ +The regular expression `pattern:\d$` finds the last digit in every line: + +```js run +let str = `Winnie: 1 +Piglet: 2 +Eeyore: 3`; + +alert( str.match(/\d$/gm) ); // 1,2,3 +``` + +Without the flag `m`, the dollar `pattern:$` would only match the end of the whole text, so only the very last digit would be found. + +```smart +"End of a line" formally means "immediately before a line break": the test `pattern:$` in multiline mode matches at all positions followed by a newline character `\n`. + +And at the text end. +``` + +## Searching for \n instead of ^ $ + +To find a newline, we can use not only anchors `pattern:^` and `pattern:$`, but also the newline character `\n`. + +What's the difference? Let's see an example. + +Here we search for `pattern:\d\n` instead of `pattern:\d$`: + +```js run +let str = `Winnie: 1 +Piglet: 2 +Eeyore: 3`; + +alert( str.match(/\d\n/gm) ); // 1\n,2\n +``` + +As we can see, there are 2 matches instead of 3. + +That's because there's no newline after `subject:3` (there's text end though, so it matches `pattern:$`). + +Another difference: now every match includes a newline character `match:\n`. Unlike the anchors `pattern:^` `pattern:$`, which only test the condition (start/end of a line), `\n` is a character, so it becomes a part of the result. + +So, a `\n` in the pattern is used when we need newline characters in the result, while anchors are used to find something at the beginning/end of a line. 
diff --git a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/solution.md b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md similarity index 100% rename from 9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/solution.md rename to 9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md diff --git a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md similarity index 99% rename from 9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md rename to 9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md index 5e32b9c48..95ab5777d 100644 --- a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md +++ b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md @@ -5,4 +5,5 @@ The time has a format: `hours:minutes`. Both hours and minutes has two digits, l Make a regexp to find time in the string: `subject:Breakfast at 09:00 in the room 123:456.` P.S. In this task there's no need to check time correctness yet, so `25:99` can also be a valid result. + P.P.S. The regexp shouldn't match `123:456`. diff --git a/9-regular-expressions/06-regexp-boundary/article.md b/9-regular-expressions/06-regexp-boundary/article.md new file mode 100644 index 000000000..e4df252a4 --- /dev/null +++ b/9-regular-expressions/06-regexp-boundary/article.md @@ -0,0 +1,52 @@ +# Word boundary: \b + +A word boundary `pattern:\b` is a test, just like `pattern:^` and `pattern:$`. + +When the regexp engine (program module that implements searching for regexps) comes across `pattern:\b`, it checks that the position in the string is a word boundary. + +There are three different positions that qualify as word boundaries: + +- At string start, if the first string character is a word character `pattern:\w`. 
+- Between two characters in the string, where one is a word character `pattern:\w` and the other is not. +- At string end, if the last string character is a word character `pattern:\w`. + +For instance, regexp `pattern:\bJava\b` will be found in `subject:Hello, Java!`, where `subject:Java` is a standalone word, but not in `subject:Hello, JavaScript!`. + +```js run +alert( "Hello, Java!".match(/\bJava\b/) ); // Java +alert( "Hello, JavaScript!".match(/\bJava\b/) ); // null +``` + +In the string `subject:Hello, Java!` following positions correspond to `pattern:\b`: + +![](hello-java-boundaries.svg) + +So, it matches the pattern `pattern:\bHello\b`, because: + +1. At the beginning of the string matches the first test `pattern:\b`. +2. Then matches the word `pattern:Hello`. +3. Then the test `pattern:\b` matches again, as we're between `subject:o` and a space. + +The pattern `pattern:\bJava\b` would also match. But not `pattern:\bHell\b` (because there's no word boundary after `l`) and not `Java!\b` (because the exclamation sign is not a wordly character `pattern:\w`, so there's no word boundary after it). + +```js run +alert( "Hello, Java!".match(/\bHello\b/) ); // Hello +alert( "Hello, Java!".match(/\bJava\b/) ); // Java +alert( "Hello, Java!".match(/\bHell\b/) ); // null (no match) +alert( "Hello, Java!".match(/\bJava!\b/) ); // null (no match) +``` + +We can use `pattern:\b` not only with words, but with digits as well. + +For example, the pattern `pattern:\b\d\d\b` looks for standalone 2-digit numbers. In other words, it looks for 2-digit numbers that are surrounded by characters different from `pattern:\w`, such as spaces or punctuation (or text start/end). 
+ +```js run +alert( "1 23 456 78".match(/\b\d\d\b/g) ); // 23,78 +alert( "12,34,56".match(/\b\d\d\b/g) ); // 12,34,56 +``` + +```warn header="Word boundary `pattern:\b` doesn't work for non-latin alphabets" +The word boundary test `pattern:\b` checks that there should be `pattern:\w` on the one side from the position and "not `pattern:\w`" - on the other side. + +But `pattern:\w` means a latin letter `a-z` (or a digit or an underscore), so the test doesn't work for other characters, e.g. cyrillic letters or hieroglyphs. +``` diff --git a/9-regular-expressions/06-regexp-boundary/hello-java-boundaries.svg b/9-regular-expressions/06-regexp-boundary/hello-java-boundaries.svg new file mode 100644 index 000000000..3d421a323 --- /dev/null +++ b/9-regular-expressions/06-regexp-boundary/hello-java-boundaries.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/9-regular-expressions/04-regexp-escaping/article.md b/9-regular-expressions/07-regexp-escaping/article.md similarity index 55% rename from 9-regular-expressions/04-regexp-escaping/article.md rename to 9-regular-expressions/07-regexp-escaping/article.md index 909cd4856..7bf989471 100644 --- a/9-regular-expressions/04-regexp-escaping/article.md +++ b/9-regular-expressions/07-regexp-escaping/article.md @@ -1,7 +1,7 @@ # Escaping, special characters -As we've seen, a backslash `"\"` is used to denote character classes. So it's a special character in regexps (just like in a regular string). +As we've seen, a backslash `pattern:\` is used to denote character classes, e.g. `pattern:\d`. So it's a special character in regexps (just like in regular strings). There are other special characters as well, that have special meaning in a regexp. They are used to do more powerful searches. Here's a full list of them: `pattern:[ \ ^ $ . | ? * + ( )`. @@ -9,7 +9,7 @@ Don't try to remember the list -- soon we'll deal with each of them separately a ## Escaping -Let's say we want to find a dot literally. 
Not "any character", but just a dot. +Let's say we want to find literally a dot. Not "any character", but just a dot. To use a special character as a regular one, prepend it with a backslash: `pattern:\.`. @@ -43,11 +43,11 @@ Here's what a search for a slash `'/'` looks like: alert( "/".match(/\//) ); // '/' ``` -On the other hand, if we're not using `/.../`, but create a regexp using `new RegExp`, then we don't need to escape it: +On the other hand, if we're not using `pattern:/.../`, but create a regexp using `new RegExp`, then we don't need to escape it: ```js run -alert( "/".match(new RegExp("/")) ); // '/' -``` +alert( "/".match(new RegExp("/")) ); // finds / +``` ## new RegExp @@ -56,30 +56,30 @@ If we are creating a regular expression with `new RegExp`, then we don't have to For instance, consider this: ```js run -let reg = new RegExp("\d\.\d"); +let regexp = new RegExp("\d\.\d"); -alert( "Chapter 5.1".match(reg) ); // null +alert( "Chapter 5.1".match(regexp) ); // null ``` -The search worked with `pattern:/\d\.\d/`, but with `new RegExp("\d\.\d")` it doesn't work, why? +The similar search in one of previous examples worked with `pattern:/\d\.\d/`, but `new RegExp("\d\.\d")` doesn't work, why? -The reason is that backslashes are "consumed" by a string. Remember, regular strings have their own special characters like `\n`, and a backslash is used for escaping. +The reason is that backslashes are "consumed" by a string. As we may recall, regular strings have their own special characters, such as `\n`, and a backslash is used for escaping. 
-Please, take a look, what "\d\.\d" really is: +Here's how "\d\.\d" is perceived: ```js run alert("\d\.\d"); // d.d ``` -The quotes "consume" backslashes and interpret them, for instance: +String quotes "consume" backslashes and interpret them on their own, for instance: - `\n` -- becomes a newline character, - `\u1234` -- becomes the Unicode character with such code, -- ...And when there's no special meaning: like `\d` or `\z`, then the backslash is simply removed. +- ...And when there's no special meaning: like `pattern:\d` or `\z`, then the backslash is simply removed. -So the call to `new RegExp` gets a string without backslashes. That's why the search doesn't work! +So `new RegExp` gets a string without backslashes. That's why the search doesn't work! -To fix it, we need to double backslashes, because quotes turn `\\` into `\`: +To fix it, we need to double backslashes, because string quotes turn `\\` into `\`: ```js run *!* @@ -87,13 +87,13 @@ let regStr = "\\d\\.\\d"; */!* alert(regStr); // \d\.\d (correct now) -let reg = new RegExp(regStr); +let regexp = new RegExp(regStr); -alert( "Chapter 5.1".match(reg) ); // 5.1 +alert( "Chapter 5.1".match(regexp) ); // 5.1 ``` ## Summary -- To search special characters `pattern:[ \ ^ $ . | ? * + ( )` literally, we need to prepend them with `\` ("escape them"). +- To search for special characters `pattern:[ \ ^ $ . | ? * + ( )` literally, we need to prepend them with a backslash `\` ("escape them"). - We also need to escape `/` if we're inside `pattern:/.../` (but not inside `new RegExp`). -- When passing a string `new RegExp`, we need to double backslashes `\\`, cause strings consume one of them. +- When passing a string to `new RegExp`, we need to double backslashes `\\`, because string quotes consume one of them.
diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/solution.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/solution.md similarity index 65% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/solution.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/solution.md index a6d71f661..378471611 100644 --- a/9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/solution.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/solution.md @@ -5,7 +5,7 @@ Answers: **no, yes**. ```js run alert( "Java".match(/Java[^script]/) ); // null ``` -- Yes, because the regexp is case-insensitive, the `pattern:[^script]` part matches the character `"S"`. +- Yes, because the part `pattern:[^script]` matches the character `"S"`. It's not one of `pattern:script`. As the regexp is case-sensitive (no `pattern:i` flag), it treats `"S"` as a different character from `"s"`.
```js run alert( "JavaScript".match(/Java[^script]/) ); // "JavaS" diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/task.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/task.md similarity index 100% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/task.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/task.md diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md similarity index 69% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md index 91568d033..69ade1b19 100644 --- a/9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md @@ -1,8 +1,8 @@ Answer: `pattern:\d\d[-:]\d\d`. ```js run -let reg = /\d\d[-:]\d\d/g; -alert( "Breakfast at 09:00. Dinner at 21-30".match(reg) ); // 09:00, 21-30 +let regexp = /\d\d[-:]\d\d/g; +alert( "Breakfast at 09:00. Dinner at 21-30".match(regexp) ); // 09:00, 21-30 ``` Please note that the dash `pattern:'-'` has a special meaning in square brackets, but only between other characters, not when it's in the beginning or at the end, so we don't need to escape it. 
diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md similarity index 76% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md index 868115bdf..c8441caf4 100644 --- a/9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md @@ -5,8 +5,8 @@ The time can be in the format `hours:minutes` or `hours-minutes`. Both hours and Write a regexp to find time: ```js -let reg = /your regexp/g; -alert( "Breakfast at 09:00. Dinner at 21-30".match(reg) ); // 09:00, 21-30 +let regexp = /your regexp/g; +alert( "Breakfast at 09:00. Dinner at 21-30".match(regexp) ); // 09:00, 21-30 ``` P.S. In this task we assume that the time is always correct, there's no need to filter out bad strings like "45:67". Later we'll deal with that too. diff --git a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md new file mode 100644 index 000000000..cb6a27e9d --- /dev/null +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md @@ -0,0 +1,197 @@ +# Sets and ranges [...] + +Several characters or character classes inside square brackets `[…]` mean to "search for any character among given". + +## Sets + +For instance, `pattern:[eao]` means any of the 3 characters: `'a'`, `'e'`, or `'o'`. + +That's called a *set*. Sets can be used in a regexp along with regular characters: + +```js run +// find [t or m], and then "op" +alert( "Mop top".match(/[tm]op/gi) ); // "Mop", "top" +``` + +Please note that although there are multiple characters in the set, they correspond to exactly one character in the match. 
+ +So the example below gives no matches: + +```js run +// find "V", then [o or i], then "la" +alert( "Voila".match(/V[oi]la/) ); // null, no matches +``` + +The pattern searches for: + +- `pattern:V`, +- then *one* of the letters `pattern:[oi]`, +- then `pattern:la`. + +So there would be a match for `match:Vola` or `match:Vila`. + +## Ranges + +Square brackets may also contain *character ranges*. + +For instance, `pattern:[a-z]` is a character in range from `a` to `z`, and `pattern:[0-5]` is a digit from `0` to `5`. + +In the example below we're searching for `"x"` followed by two digits or letters from `A` to `F`: + +```js run +alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF +``` + +Here `pattern:[0-9A-F]` has two ranges: it searches for a character that is either a digit from `0` to `9` or a letter from `A` to `F`. + +If we'd like to look for lowercase letters as well, we can add the range `a-f`: `pattern:[0-9A-Fa-f]`. Or add the flag `pattern:i`. + +We can also use character classes inside `[…]`. + +For instance, if we'd like to look for a wordly character `pattern:\w` or a hyphen `pattern:-`, then the set is `pattern:[\w-]`. + +Combining multiple classes is also possible, e.g. `pattern:[\s\d]` means "a space character or a digit". + +```smart header="Character classes are shorthands for certain character sets" +For instance: + +- **\d** -- is the same as `pattern:[0-9]`, +- **\w** -- is the same as `pattern:[a-zA-Z0-9_]`, +- **\s** -- is the same as `pattern:[\t\n\v\f\r ]`, plus few other rare unicode space characters. +``` + +### Example: multi-language \w + +As the character class `pattern:\w` is a shorthand for `pattern:[a-zA-Z0-9_]`, it can't find Chinese hieroglyphs, Cyrillic letters, etc. + +We can write a more universal pattern, that looks for wordly characters in any language. That's easy with unicode properties: `pattern:[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]`. + +Let's decipher it. 
Similar to `pattern:\w`, we're making a set of our own that includes characters with the following unicode properties: + +- `Alphabetic` (`Alpha`) - for letters, +- `Mark` (`M`) - for accents, +- `Decimal_Number` (`Nd`) - for digits, +- `Connector_Punctuation` (`Pc`) - for the underscore `'_'` and similar characters, +- `Join_Control` (`Join_C`) - two special codes `200c` and `200d`, used in ligatures, e.g. in Arabic. + +An example of use: + +```js run +let regexp = /[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]/gu; + +let str = `Hi 你好 12`; + +// finds all letters and digits: +alert( str.match(regexp) ); // H,i,你,好,1,2 +``` + +Of course, we can edit this pattern: add unicode properties or remove them. Unicode properties are covered in more detail in the article . + +```warn header="Unicode properties aren't supported in Edge and Firefox" +Unicode properties `pattern:\p{…}` are not yet implemented in Edge and Firefox. If we really need them, we can use the library [XRegExp](http://xregexp.com/). + +Or just use ranges of characters in a language that interests us, e.g. `pattern:[а-я]` for Cyrillic letters. +``` + +## Excluding ranges + +Besides normal ranges, there are "excluding" ranges that look like `pattern:[^…]`. + +They are denoted by a caret character `^` at the start and match any character *except the given ones*. + +For instance: + +- `pattern:[^aeyo]` -- any character except `'a'`, `'e'`, `'y'` or `'o'`. +- `pattern:[^0-9]` -- any character except a digit, the same as `pattern:\D`. +- `pattern:[^\s]` -- any non-space character, same as `\S`. + +The example below looks for any characters except letters, digits and spaces: + +```js run +alert( "alice15@gmail.com".match(/[^\d\sA-Z]/gi) ); // @ and . +``` + +## Escaping in […] + +Usually when we want to find exactly a special character, we need to escape it like `pattern:\.`. And if we need a backslash, then we use `pattern:\\`, and so on.
+ +In square brackets we can use the vast majority of special characters without escaping: + +- Symbols `pattern:. + ( )` never need escaping. +- A hyphen `pattern:-` is not escaped in the beginning or the end (where it does not define a range). +- A caret `pattern:^` is only escaped in the beginning (where it means exclusion). +- The closing square bracket `pattern:]` is always escaped (if we need to look for that symbol). + +In other words, all special characters are allowed without escaping, except when they mean something for square brackets. + +A dot `.` inside square brackets means just a dot. The pattern `pattern:[.,]` would look for one of characters: either a dot or a comma. + +In the example below the regexp `pattern:[-().^+]` looks for one of the characters `-().^+`: + +```js run +// No need to escape +let regexp = /[-().^+]/g; + +alert( "1 + 2 - 3".match(regexp) ); // Matches +, - +``` + +...But if you decide to escape them "just in case", then there would be no harm: + +```js run +// Escaped everything +let regexp = /[\-\(\)\.\^\+]/g; + +alert( "1 + 2 - 3".match(regexp) ); // also works: +, - +``` + +## Ranges and flag "u" + +If there are surrogate pairs in the set, flag `pattern:u` is required for them to work correctly. + +For instance, let's look for `pattern:[𝒳𝒴]` in the string `subject:𝒳`: + +```js run +alert( '𝒳'.match(/[𝒳𝒴]/) ); // shows a strange character, like [?] +// (the search was performed incorrectly, half-character returned) +``` + +The result is incorrect, because by default regular expressions "don't know" about surrogate pairs. + +The regular expression engine thinks that `[𝒳𝒴]` -- are not two, but four characters: +1. left half of `𝒳` `(1)`, +2. right half of `𝒳` `(2)`, +3. left half of `𝒴` `(3)`, +4. right half of `𝒴` `(4)`. 
+ +We can see their codes like this: + +```js run +for(let i=0; i<'𝒳𝒴'.length; i++) { + alert('𝒳𝒴'.charCodeAt(i)); // 55349, 56499, 55349, 56500 +}; +``` + +So, the example above finds and shows the left half of `𝒳`. + +If we add flag `pattern:u`, then the behavior will be correct: + +```js run +alert( '𝒳'.match(/[𝒳𝒴]/u) ); // 𝒳 +``` + +The similar situation occurs when looking for a range, such as `[𝒳-𝒴]`. + +If we forget to add flag `pattern:u`, there will be an error: + +```js run +'𝒳'.match(/[𝒳-𝒴]/); // Error: Invalid regular expression +``` + +The reason is that without flag `pattern:u` surrogate pairs are perceived as two characters, so `[𝒳-𝒴]` is interpreted as `[<55349><56499>-<55349><56500>]` (every surrogate pair is replaced with its codes). Now it's easy to see that the range `56499-55349` is invalid: its starting code `56499` is greater than the end `55349`. That's the formal reason for the error. + +With the flag `pattern:u` the pattern works correctly: + +```js run +// look for characters from 𝒳 to 𝒵 +alert( '𝒴'.match(/[𝒳-𝒵]/u) ); // 𝒴 +``` diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/solution.md b/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/solution.md deleted file mode 100644 index c066f3e36..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/solution.md +++ /dev/null @@ -1,17 +0,0 @@ -We need to find the beginning of the comment `match:`. - -The first idea could be `pattern:` -- the lazy quantifier makes the dot stop right before `match:-->`. - -But a dot in JavaScript means "any symbol except the newline". So multiline comments won't be found. - -We can use `pattern:[\s\S]` instead of the dot to match "anything": - -```js run -let reg = //g; - -let str = `... .. .. 
-`; - -alert( str.match(reg) ); // '', '' -``` diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy1.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy1.svg deleted file mode 100644 index 65e490e97..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy1.svg +++ /dev/null @@ -1,16 +0,0 @@ - - - - witch_greedy1.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy2.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy2.svg deleted file mode 100644 index 1ee351588..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy2.svg +++ /dev/null @@ -1,16 +0,0 @@ - - - - witch_greedy2.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy3.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy3.svg deleted file mode 100644 index e60ed1618..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy3.svg +++ /dev/null @@ -1,16 +0,0 @@ - - - - witch_greedy3.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy4.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy4.svg deleted file mode 100644 index 630a62230..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy4.svg +++ /dev/null @@ -1,19 +0,0 @@ - - - - witch_greedy4.svg - Created with sketchtool. 
- - - - a "witch" and her "broom" is one - - - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy5.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy5.svg deleted file mode 100644 index fa8db5846..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy5.svg +++ /dev/null @@ -1,19 +0,0 @@ - - - - witch_greedy5.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy6.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy6.svg deleted file mode 100644 index c0165e88d..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy6.svg +++ /dev/null @@ -1,17 +0,0 @@ - - - - witch_greedy6.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy3.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy3.svg deleted file mode 100644 index 61506bb5a..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy3.svg +++ /dev/null @@ -1,18 +0,0 @@ - - - - witch_lazy3.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy4.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy4.svg deleted file mode 100644 index fecc1a5ae..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy4.svg +++ /dev/null @@ -1,18 +0,0 @@ - - - - witch_lazy4.svg - Created with sketchtool. 
- - - - a "witch" and her "broom" is one - - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy5.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy5.svg deleted file mode 100644 index b75a9d7bd..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy5.svg +++ /dev/null @@ -1,16 +0,0 @@ - - - - witch_lazy5.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy6.svg b/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy6.svg deleted file mode 100644 index 867e30c85..000000000 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy6.svg +++ /dev/null @@ -1,18 +0,0 @@ - - - - witch_lazy6.svg - Created with sketchtool. - - - - a "witch" and her "broom" is one - - - - - - - - - \ No newline at end of file diff --git a/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/solution.md b/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/solution.md deleted file mode 100644 index d653ff970..000000000 --- a/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/solution.md +++ /dev/null @@ -1,29 +0,0 @@ -A regexp to search 3-digit color `#abc`: `pattern:/#[a-f0-9]{3}/i`. - -We can add exactly 3 more optional hex digits. We don't need more or less. Either we have them or we don't. - -The simplest way to add them -- is to append to the regexp: `pattern:/#[a-f0-9]{3}([a-f0-9]{3})?/i` - -We can do it in a smarter way though: `pattern:/#([a-f0-9]{3}){1,2}/i`. - -Here the regexp `pattern:[a-f0-9]{3}` is in parentheses to apply the quantifier `pattern:{1,2}` to it as a whole. 
- -In action: - -```js run -let reg = /#([a-f0-9]{3}){1,2}/gi; - -let str = "color: #3f3; background-color: #AA00ef; and: #abcd"; - -alert( str.match(reg) ); // #3f3 #AA00ef #abc -``` - -There's a minor problem here: the pattern found `match:#abc` in `subject:#abcd`. To prevent that we can add `pattern:\b` to the end: - -```js run -let reg = /#([a-f0-9]{3}){1,2}\b/gi; - -let str = "color: #3f3; background-color: #AA00ef; and: #abcd"; - -alert( str.match(reg) ); // #3f3 #AA00ef -``` diff --git a/9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/solution.md b/9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/solution.md deleted file mode 100644 index 23065413e..000000000 --- a/9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/solution.md +++ /dev/null @@ -1,18 +0,0 @@ - -An non-negative integer number is `pattern:\d+`. We should exclude `0` as the first digit, as we don't need zero, but we can allow it in further digits. - -So that gives us `pattern:[1-9]\d*`. - -A decimal part is: `pattern:\.\d+`. - -Because the decimal part is optional, let's put it in parentheses with the quantifier `pattern:'?'`. - -Finally we have the regexp: `pattern:[1-9]\d*(\.\d+)?`: - -```js run -let reg = /[1-9]\d*(\.\d+)?/g; - -let str = "1.5 0 -5 12. 123.4."; - -alert( str.match(reg) ); // 1.5, 0, 12, 123.4 -``` diff --git a/9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/task.md b/9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/task.md deleted file mode 100644 index ad8c81eae..000000000 --- a/9-regular-expressions/09-regexp-groups/3-find-decimal-positive-numbers/task.md +++ /dev/null @@ -1,12 +0,0 @@ -# Find positive numbers - -Create a regexp that looks for positive numbers, including those without a decimal point. - -An example of use: -```js -let reg = /your regexp/g; - -let str = "1.5 0 -5 12. 
123.4."; - -alert( str.match(reg) ); // 1.5, 12, 123.4 (ignores 0 and -5) -``` diff --git a/9-regular-expressions/09-regexp-groups/article.md b/9-regular-expressions/09-regexp-groups/article.md deleted file mode 100644 index 216b359ab..000000000 --- a/9-regular-expressions/09-regexp-groups/article.md +++ /dev/null @@ -1,237 +0,0 @@ -# Capturing groups - -A part of a pattern can be enclosed in parentheses `pattern:(...)`. This is called a "capturing group". - -That has two effects: - -1. It allows to place a part of the match into a separate array. -2. If we put a quantifier after the parentheses, it applies to the parentheses as a whole, not the last character. - -## Example - -In the example below the pattern `pattern:(go)+` finds one or more `match:'go'`: - -```js run -alert( 'Gogogo now!'.match(/(go)+/i) ); // "Gogogo" -``` - -Without parentheses, the pattern `pattern:/go+/` means `subject:g`, followed by `subject:o` repeated one or more times. For instance, `match:goooo` or `match:gooooooooo`. - -Parentheses group the word `pattern:(go)` together. - -Let's make something more complex -- a regexp to match an email. - -Examples of emails: - -``` -my@mail.com -john.smith@site.com.uk -``` - -The pattern: `pattern:[-.\w]+@([\w-]+\.)+[\w-]{2,20}`. - -1. The first part `pattern:[-.\w]+` (before `@`) may include any alphanumeric word characters, a dot and a dash, to match `match:john.smith`. -2. Then `pattern:@`, and the domain. It may be a subdomain like `host.site.com.uk`, so we match it as "a word followed by a dot `pattern:([\w-]+\.)` (repeated), and then the last part must be a word: `match:com` or `match:uk` (but not very long: 2-20 characters). - -That regexp is not perfect, but good enough to fix errors or occasional mistypes. 
- -For instance, we can find all emails in the string: - -```js run -let reg = /[-.\w]+@([\w-]+\.)+[\w-]{2,20}/g; - -alert("my@mail.com @ his@site.com.uk".match(reg)); // my@mail.com, his@site.com.uk -``` - -In this example parentheses were used to make a group for repeating `pattern:(...)+`. But there are other uses too, let's see them. - -## Contents of parentheses - -Parentheses are numbered from left to right. The search engine remembers the content of each and allows to reference it in the pattern or in the replacement string. - -For instance, we'd like to find HTML tags `pattern:<.*?>`, and process them. - -Let's wrap the inner content into parentheses, like this: `pattern:<(.*?)>`. - -We'll get them into an array: - -```js run -let str = '

Hello, world!

'; -let reg = /<(.*?)>/; - -alert( str.match(reg) ); // Array: ["

", "h1"] -``` - -The call to [String#match](mdn:js/String/match) returns groups only if the regexp has no `pattern:/.../g` flag. - -If we need all matches with their groups then we can use `.matchAll` or `regexp.exec` as described in : - -```js run -let str = '

Hello, world!

'; - -// two matches: opening

and closing

tags -let reg = /<(.*?)>/g; - -let matches = Array.from( str.matchAll(reg) ); - -alert(matches[0]); // Array: ["

", "h1"] -alert(matches[1]); // Array: ["

", "/h1"] -``` - -Here we have two matches for `pattern:<(.*?)>`, each of them is an array with the full match and groups. - -## Nested groups - -Parentheses can be nested. In this case the numbering also goes from left to right. - -For instance, when searching a tag in `subject:` we may be interested in: - -1. The tag content as a whole: `match:span class="my"`. -2. The tag name: `match:span`. -3. The tag attributes: `match:class="my"`. - -Let's add parentheses for them: - -```js run -let str = ''; - -let reg = /<(([a-z]+)\s*([^>]*))>/; - -let result = str.match(reg); -alert(result); // , span class="my", span, class="my" -``` - -Here's how groups look: - -![](regexp-nested-groups.svg) - -At the zero index of the `result` is always the full match. - -Then groups, numbered from left to right. Whichever opens first gives the first group `result[1]`. Here it encloses the whole tag content. - -Then in `result[2]` goes the group from the second opening `pattern:(` till the corresponding `pattern:)` -- tag name, then we don't group spaces, but group attributes for `result[3]`. - -**If a group is optional and doesn't exist in the match, the corresponding `result` index is present (and equals `undefined`).** - -For instance, let's consider the regexp `pattern:a(z)?(c)?`. It looks for `"a"` optionally followed by `"z"` optionally followed by `"c"`. - -If we run it on the string with a single letter `subject:a`, then the result is: - -```js run -let match = 'a'.match(/a(z)?(c)?/); - -alert( match.length ); // 3 -alert( match[0] ); // a (whole match) -alert( match[1] ); // undefined -alert( match[2] ); // undefined -``` - -The array has the length of `3`, but all groups are empty. - -And here's a more complex match for the string `subject:ack`: - -```js run -let match = 'ack'.match(/a(z)?(c)?/) - -alert( match.length ); // 3 -alert( match[0] ); // ac (whole match) -alert( match[1] ); // undefined, because there's nothing for (z)? 
-alert( match[2] ); // c -``` - -The array length is permanent: `3`. But there's nothing for the group `pattern:(z)?`, so the result is `["ac", undefined, "c"]`. - -## Named groups - -Remembering groups by their numbers is hard. For simple patterns it's doable, but for more complex ones we can give names to parentheses. - -That's done by putting `pattern:?` immediately after the opening paren, like this: - -```js run -*!* -let dateRegexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/; -*/!* -let str = "2019-04-30"; - -let groups = str.match(dateRegexp).groups; - -alert(groups.year); // 2019 -alert(groups.month); // 04 -alert(groups.day); // 30 -``` - -As you can see, the groups reside in the `.groups` property of the match. - -We can also use them in replacements, as `pattern:$` (like `$1..9`, but name instead of a digit). - -For instance, let's rearrange the date into `day.month.year`: - -```js run -let dateRegexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/; - -let str = "2019-04-30"; - -let rearranged = str.replace(dateRegexp, '$.$.$'); - -alert(rearranged); // 30.04.2019 -``` - -If we use a function, then named `groups` object is always the last argument: - -```js run -let dateRegexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/; - -let str = "2019-04-30"; - -let rearranged = str.replace(dateRegexp, - (str, year, month, day, offset, input, groups) => - `${groups.day}.${groups.month}.${groups.year}` -); - -alert(rearranged); // 30.04.2019 -``` - -Usually, when we intend to use named groups, we don't need positional arguments of the function. For the majority of real-life cases we only need `str` and `groups`. 
- -So we can write it a little bit shorter: - -```js -let rearranged = str.replace(dateRegexp, (str, ...args) => { - let {year, month, day} = args.pop(); - alert(str); // 2019-04-30 - alert(year); // 2019 - alert(month); // 04 - alert(day); // 30 -}); -``` - - -## Non-capturing groups with ?: - -Sometimes we need parentheses to correctly apply a quantifier, but we don't want the contents in results. - -A group may be excluded by adding `pattern:?:` in the beginning. - -For instance, if we want to find `pattern:(go)+`, but don't want to remember the contents (`go`) in a separate array item, we can write: `pattern:(?:go)+`. - -In the example below we only get the name "John" as a separate member of the `results` array: - -```js run -let str = "Gogo John!"; -*!* -// exclude Gogo from capturing -let reg = /(?:go)+ (\w+)/i; -*/!* - -let result = str.match(reg); - -alert( result.length ); // 2 -alert( result[1] ); // John -``` - -## Summary - -- Parentheses can be: - - capturing `(...)`, ordered left-to-right, accessible by number. - - named capturing `(?...)`, accessible by name. - - non-capturing `(?:...)`, used only to apply quantifier to the whole groups. diff --git a/9-regular-expressions/09-regexp-groups/regexp-nested-groups.svg b/9-regular-expressions/09-regexp-groups/regexp-nested-groups.svg deleted file mode 100644 index 75ced6ff6..000000000 --- a/9-regular-expressions/09-regexp-groups/regexp-nested-groups.svg +++ /dev/null @@ -1,48 +0,0 @@ - - - - regexp-nested-groups.svg - Created with sketchtool. 
- - - - < - (( - [a-z]+ - ) - \s* - ( - [^>]* - )) - > - - - - - - - - - 1 - - - span class="my" - - - 2 - - - span - - - - - - 3 - - - class="my" - - - - \ No newline at end of file diff --git a/9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/solution.md b/9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/solution.md similarity index 57% rename from 9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/solution.md rename to 9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/solution.md index d4ddb1369..21b8762ec 100644 --- a/9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/solution.md +++ b/9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/solution.md @@ -2,8 +2,8 @@ Solution: ```js run -let reg = /\.{3,}/g; -alert( "Hello!... How goes?.....".match(reg) ); // ..., ..... +let regexp = /\.{3,}/g; +alert( "Hello!... How goes?.....".match(regexp) ); // ..., ..... ``` Please note that the dot is a special character, so we have to escape it and insert as `\.`. diff --git a/9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/task.md b/9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/task.md similarity index 59% rename from 9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/task.md rename to 9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/task.md index 6fd91bdcf..4140b4a98 100644 --- a/9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/task.md +++ b/9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/task.md @@ -9,6 +9,6 @@ Create a regexp to find ellipsis: 3 (or more?) dots in a row. Check it: ```js -let reg = /your regexp/g; -alert( "Hello!... How goes?.....".match(reg) ); // ..., ..... +let regexp = /your regexp/g; +alert( "Hello!... How goes?.....".match(regexp) ); // ..., ..... 
``` diff --git a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/solution.md b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/solution.md similarity index 67% rename from 9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/solution.md rename to 9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/solution.md index ec871d05c..b9e1f85a5 100644 --- a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/solution.md +++ b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/solution.md @@ -1,17 +1,17 @@ -We need to look for `#` followed by 6 hexadimal characters. +We need to look for `#` followed by 6 hexadecimal characters. -A hexadimal character can be described as `pattern:[0-9a-fA-F]`. Or if we use the `i` flag, then just `pattern:[0-9a-f]`. +A hexadecimal character can be described as `pattern:[0-9a-fA-F]`. Or if we use the `pattern:i` flag, then just `pattern:[0-9a-f]`. Then we can look for 6 of them using the quantifier `pattern:{6}`. As a result, we have the regexp: `pattern:/#[a-f0-9]{6}/gi`. 
```js run -let reg = /#[a-f0-9]{6}/gi; +let regexp = /#[a-f0-9]{6}/gi; let str = "color:#121212; background-color:#AA00ef bad-colors:f#fddee #fd2" -alert( str.match(reg) ); // #121212,#AA00ef +alert( str.match(regexp) ); // #121212,#AA00ef ``` The problem is that it finds the color in longer sequences: diff --git a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/task.md b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/task.md similarity index 71% rename from 9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/task.md rename to 9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/task.md index 1960a09c6..9a1923a7e 100644 --- a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/task.md +++ b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/task.md @@ -1,15 +1,15 @@ # Regexp for HTML colors -Create a regexp to search HTML-colors written as `#ABCDEF`: first `#` and then 6 hexadimal characters. +Create a regexp to search HTML-colors written as `#ABCDEF`: first `#` and then 6 hexadecimal characters. An example of use: ```js -let reg = /...your regexp.../ +let regexp = /...your regexp.../ let str = "color:#121212; background-color:#AA00ef bad-colors:f#fddee #fd2 #12345678"; -alert( str.match(reg) ) // #121212,#AA00ef +alert( str.match(regexp) ) // #121212,#AA00ef ``` P.S. In this task we do not need other color formats like `#123` or `rgb(1,2,3)` etc. 
diff --git a/9-regular-expressions/07-regexp-quantifiers/article.md b/9-regular-expressions/09-regexp-quantifiers/article.md similarity index 63% rename from 9-regular-expressions/07-regexp-quantifiers/article.md rename to 9-regular-expressions/09-regexp-quantifiers/article.md index 5ab592561..1a7eecfeb 100644 --- a/9-regular-expressions/07-regexp-quantifiers/article.md +++ b/9-regular-expressions/09-regexp-quantifiers/article.md @@ -2,7 +2,7 @@ Let's say we have a string like `+7(903)-123-45-67` and want to find all numbers in it. But unlike before, we are interested not in single digits, but full numbers: `7, 903, 123, 45, 67`. -A number is a sequence of 1 or more digits `\d`. To mark how many we need, we need to append a *quantifier*. +A number is a sequence of 1 or more digits `pattern:\d`. To mark how many we need, we can append a *quantifier*. ## Quantity {n} @@ -12,7 +12,7 @@ A quantifier is appended to a character (or a character class, or a `[...]` set It has a few advanced forms, let's see examples: -The exact count: `{5}` +The exact count: `pattern:{5}` : `pattern:\d{5}` denotes exactly 5 digits, the same as `pattern:\d\d\d\d\d`. The example below looks for a 5-digit number: @@ -23,7 +23,7 @@ The exact count: `{5}` We can add `\b` to exclude longer numbers: `pattern:\b\d{5}\b`. -The range: `{3,5}`, match 3-5 times +The range: `pattern:{3,5}`, match 3-5 times : To find numbers from 3 to 5 digits we can put the limits into curly braces: `pattern:\d{3,5}` ```js run @@ -54,8 +54,8 @@ alert(numbers); // 7,903,123,45,67 There are shorthands for most used quantifiers: -`+` -: Means "one or more", the same as `{1,}`. +`pattern:+` +: Means "one or more", the same as `pattern:{1,}`. For instance, `pattern:\d+` looks for numbers: @@ -65,8 +65,8 @@ There are shorthands for most used quantifiers: alert( str.match(/\d+/g) ); // 7,903,123,45,67 ``` -`?` -: Means "zero or one", the same as `{0,1}`. In other words, it makes the symbol optional. 
+`pattern:?` +: Means "zero or one", the same as `pattern:{0,1}`. In other words, it makes the symbol optional. For instance, the pattern `pattern:ou?r` looks for `match:o` followed by zero or one `match:u`, and then `match:r`. @@ -78,16 +78,16 @@ There are shorthands for most used quantifiers: alert( str.match(/colou?r/g) ); // color, colour ``` -`*` -: Means "zero or more", the same as `{0,}`. That is, the character may repeat any times or be absent. +`pattern:*` +: Means "zero or more", the same as `pattern:{0,}`. That is, the character may repeat any times or be absent. - For example, `pattern:\d0*` looks for a digit followed by any number of zeroes: + For example, `pattern:\d0*` looks for a digit followed by any number of zeroes (may be many or none): ```js run alert( "100 10 1".match(/\d0*/g) ); // 100, 10, 1 ``` - Compare it with `'+'` (one or more): + Compare it with `pattern:+` (one or more): ```js run alert( "100 10 1".match(/\d0+/g) ); // 100, 10 @@ -98,43 +98,45 @@ There are shorthands for most used quantifiers: Quantifiers are used very often. They serve as the main "building block" of complex regular expressions, so let's see more examples. -Regexp "decimal fraction" (a number with a floating point): `pattern:\d+\.\d+` -: In action: - ```js run - alert( "0 1 12.345 7890".match(/\d+\.\d+/g) ); // 12.345 - ``` +**Regexp for decimal fractions (a number with a floating point): `pattern:\d+\.\d+`** -Regexp "open HTML-tag without attributes", like `<span>` or `<p>`: `pattern:/<[a-z]+>/i` -: In action: +In action: +```js run +alert( "0 1 12.345 7890".match(/\d+\.\d+/g) ); // 12.345 +``` + +**Regexp for an "opening HTML-tag without attributes", such as `<span>` or `<p>`.** + +1. The simplest one: `pattern:/<[a-z]+>/i` ```js run alert( "<body> ... </body>".match(/<[a-z]+>/gi) ); // <body> ``` - We look for character `pattern:'<'` followed by one or more English letters, and then `pattern:'>'`. + The regexp looks for character `pattern:'<'` followed by one or more Latin letters, and then `pattern:'>'`. + +2. Improved: `pattern:/<[a-z][a-z0-9]*>/i` -Regexp "open HTML-tag without attributes" (improved): `pattern:/<[a-z][a-z0-9]*>/i` -: Better regexp: according to the standard, HTML tag name may have a digit at any position except the first one, like `<h1>`. + According to the standard, HTML tag name may have a digit at any position except the first one, like `<h1>`. ```js run alert( "<h1>Hi!</h1>".match(/<[a-z][a-z0-9]*>/gi) ); // <h1> ``` -Regexp "opening or closing HTML-tag without attributes": `pattern:/<\/?[a-z][a-z0-9]*>/i` -: We added an optional slash `pattern:/?` before the tag. Had to escape it with a backslash, otherwise JavaScript would think it is the pattern end. +**Regexp "opening or closing HTML-tag without attributes": `pattern:/<\/?[a-z][a-z0-9]*>/i`** - ```js run - alert( "<h1>Hi!</h1>".match(/<\/?[a-z][a-z0-9]*>/gi) ); // <h1>, </h1>
- ``` +We added an optional slash `pattern:/?` near the beginning of the pattern. Had to escape it with a backslash, otherwise JavaScript would think it is the pattern end. + +```js run +alert( "<h1>Hi!</h1>".match(/<\/?[a-z][a-z0-9]*>/gi) ); // <h1>, </h1>
+``` ```smart header="To make a regexp more precise, we often need make it more complex" We can see one common rule in these examples: the more precise is the regular expression -- the longer and more complex it is. -For instance, for HTML tags we could use a simpler regexp: `pattern:<\w+>`. - -...But because `pattern:\w` means any English letter or a digit or `'_'`, the regexp also matches non-tags, for instance `match:<_>`. So it's much simpler than `pattern:<[a-z][a-z0-9]*>`, but less reliable. +For instance, for HTML tags we could use a simpler regexp: `pattern:<\w+>`. But as HTML has stricter restrictions for a tag name, `pattern:<[a-z][a-z0-9]*>` is more reliable. -Are we ok with `pattern:<\w+>` or we need `pattern:<[a-z][a-z0-9]*>`? +Can we use `pattern:<\w+>` or we need `pattern:<[a-z][a-z0-9]*>`? -In real life both variants are acceptable. Depends on how tolerant we can be to "extra" matches and whether it's difficult or not to filter them out by other means. +In real life both variants are acceptable. Depends on how tolerant we can be to "extra" matches and whether it's difficult or not to remove them from the result by other means. ``` diff --git a/9-regular-expressions/10-regexp-backreferences/article.md b/9-regular-expressions/10-regexp-backreferences/article.md deleted file mode 100644 index eff5cab45..000000000 --- a/9-regular-expressions/10-regexp-backreferences/article.md +++ /dev/null @@ -1,65 +0,0 @@ -# Backreferences in pattern: \n and \k - -We can use the contents of capturing groups `(...)` not only in the result or in the replacement string, but also in the pattern itself. - -## Backreference by number: \n - -A group can be referenced in the pattern using `\n`, where `n` is the group number. - -To make things clear let's consider a task. - -We need to find a quoted string: either a single-quoted `subject:'...'` or a double-quoted `subject:"..."` -- both variants need to match. - -How to look for them? 
- -We can put both kinds of quotes in the square brackets: `pattern:['"](.*?)['"]`, but it would find strings with mixed quotes, like `match:"...'` and `match:'..."`. That would lead to incorrect matches when one quote appears inside other ones, like the string `subject:"She's the one!"`: - -```js run -let str = `He said: "She's the one!".`; - -let reg = /['"](.*?)['"]/g; - -// The result is not what we expect -alert( str.match(reg) ); // "She' -``` - -As we can see, the pattern found an opening quote `match:"`, then the text is consumed lazily till the other quote `match:'`, that closes the match. - -To make sure that the pattern looks for the closing quote exactly the same as the opening one, we can wrap it into a capturing group and use the backreference. - -Here's the correct code: - -```js run -let str = `He said: "She's the one!".`; - -*!* -let reg = /(['"])(.*?)\1/g; -*/!* - -alert( str.match(reg) ); // "She's the one!" -``` - -Now it works! The regular expression engine finds the first quote `pattern:(['"])` and remembers the content of `pattern:(...)`, that's the first capturing group. - -Further in the pattern `pattern:\1` means "find the same text as in the first group", exactly the same quote in our case. - -Please note: - -- To reference a group inside a replacement string -- we use `$1`, while in the pattern -- a backslash `\1`. -- If we use `?:` in the group, then we can't reference it. Groups that are excluded from capturing `(?:...)` are not remembered by the engine. - -## Backreference by name: `\k` - -For named groups, we can backreference by `\k`. - -The same example with the named group: - -```js run -let str = `He said: "She's the one!".`; - -*!* -let reg = /(?['"])(.*?)\k/g; -*/!* - -alert( str.match(reg) ); // "She's the one!" 
-``` diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/solution.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/solution.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/solution.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/task.md b/9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/task.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/task.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/task.md diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md new file mode 100644 index 000000000..0244963d1 --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md @@ -0,0 +1,15 @@ +We need to find the beginning of the comment `match:<!--`, then everything till the end of `match:-->`. + +An acceptable variant is `pattern:<!--.*?-->` -- the lazy quantifier makes the dot stop right before `match:-->`. We also need to add flag `pattern:s` for the dot to include newlines. + +Otherwise multiline comments won't be found: + +```js run +let regexp = /<!--.*?-->/gs; + +let str = `... <!-- My -- comment + test --> .. <!----> ..
+`; + +alert( str.match(regexp) ); // '<!-- My -- comment \n test -->', '<!---->' +``` diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/task.md b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/task.md similarity index 56% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/task.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/task.md index 81fd5c634..551d9c725 100644 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/task.md +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/task.md @@ -3,11 +3,11 @@ Find all HTML comments in the text: ```js -let reg = /your regexp/g; +let regexp = /your regexp/g; let str = `... <!-- My -- comment test --> .. <!----> .. `; -alert( str.match(reg) ); // '<!-- My -- comment \n test -->', '<!---->' +alert( str.match(regexp) ); // '<!-- My -- comment \n test -->', '<!---->' ``` diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md similarity index 51% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md index c453926fa..b4d9f7496 100644 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md @@ -2,9 +2,9 @@ The solution is `pattern:<[^<>]+>`. 
```js run -let reg = /<[^<>]+>/g; +let regexp = /<[^<>]+>/g; let str = '<> '; -alert( str.match(reg) ); // '', '', '' +alert( str.match(regexp) ); // '', '', '' ``` diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md b/9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md similarity index 74% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md index e3c39c373..8e96c921d 100644 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md @@ -5,11 +5,11 @@ Create a regular expression to find all (opening and closing) HTML tags with the An example of use: ```js run -let reg = /your regexp/g; +let regexp = /your regexp/g; let str = '<> '; -alert( str.match(reg) ); // '', '', '' +alert( str.match(regexp) ); // '', '', '' ``` Here we assume that tag attributes may not contain `<` and `>` (inside squotes too), that simplifies things a bit. diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/article.md b/9-regular-expressions/10-regexp-greedy-and-lazy/article.md similarity index 73% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/article.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/article.md index 0fbcfe559..79abc559d 100644 --- a/9-regular-expressions/08-regexp-greedy-and-lazy/article.md +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/article.md @@ -8,7 +8,7 @@ Let's take the following task as an example. We have a text and need to replace all quotes `"..."` with guillemet marks: `«...»`. They are preferred for typography in many countries. -For instance: `"Hello, world"` should become `«Hello, world»`. 
Some countries prefer other quotes, like `„Witam, świat!”` (Polish) or `「你好,世界」` (Chinese), but for our task let's choose `«...»`. +For instance: `"Hello, world"` should become `«Hello, world»`. There exist other quotes, such as `„Witam, świat!”` (Polish) or `「你好,世界」` (Chinese), but for our task let's choose `«...»`. The first thing to do is to locate quoted strings, and then we can replace them. @@ -17,11 +17,11 @@ A regular expression like `pattern:/".+"/g` (a quote, then something, then the o Let's try it: ```js run -let reg = /".+"/g; +let regexp = /".+"/g; let str = 'a "witch" and her "broom" is one'; -alert( str.match(reg) ); // "witch" and her "broom" +alert( str.match(regexp) ); // "witch" and her "broom" ``` ...We can see that it works not as intended! @@ -35,7 +35,7 @@ That can be described as "greediness is the cause of all evil". To find a match, the regular expression engine uses the following algorithm: - For every position in the string - - Match the pattern at that position. + - Try to match the pattern at that position. - If there's no match, go to the next position. These common words do not make it obvious why the regexp fails, so let's elaborate how the search works for the pattern `pattern:".+"`. @@ -44,7 +44,7 @@ These common words do not make it obvious why the regexp fails, so let's elabora The regular expression engine tries to find it at the zero position of the source string `subject:a "witch" and her "broom" is one`, but there's `subject:a` there, so there's immediately no match. 
- Then it advances: goes to the next positions in the source string and tries to find the first character of the pattern there, and finally finds the quote at the 3rd position: + Then it advances: goes to the next positions in the source string and tries to find the first character of the pattern there, fails again, and finally finds the quote at the 3rd position: ![](witch_greedy1.svg) @@ -54,13 +54,13 @@ These common words do not make it obvious why the regexp fails, so let's elabora ![](witch_greedy2.svg) -3. Then the dot repeats because of the quantifier `pattern:.+`. The regular expression engine builds the match by taking characters one by one while it is possible. +3. Then the dot repeats because of the quantifier `pattern:.+`. The regular expression engine adds to the match one character after another. - ...When it becomes impossible? All characters match the dot, so it only stops when it reaches the end of the string: + ...Until when? All characters match the dot, so it only stops when it reaches the end of the string: ![](witch_greedy3.svg) -4. Now the engine finished repeating for `pattern:.+` and tries to find the next character of the pattern. It's the quote `pattern:"`. But there's a problem: the string has finished, there are no more characters! +4. Now the engine finished repeating `pattern:.+` and tries to find the next character of the pattern. It's the quote `pattern:"`. But there's a problem: the string has finished, there are no more characters! The regular expression engine understands that it took too many `pattern:.+` and starts to *backtrack*. @@ -68,9 +68,9 @@ These common words do not make it obvious why the regexp fails, so let's elabora ![](witch_greedy4.svg) - Now it assumes that `pattern:.+` ends one character before the end and tries to match the rest of the pattern from that position. + Now it assumes that `pattern:.+` ends one character before the string end and tries to match the rest of the pattern from that position. 
- If there were a quote there, then that would be the end, but the last character is `subject:'e'`, so there's no match. + If there were a quote there, then the search would end, but the last character is `subject:'e'`, so there's no match. 5. ...So the engine decreases the number of repetitions of `pattern:.+` by one more character: @@ -84,19 +84,19 @@ These common words do not make it obvious why the regexp fails, so let's elabora 7. The match is complete. -8. So the first match is `match:"witch" and her "broom"`. The further search starts where the first match ends, but there are no more quotes in the rest of the string `subject:is one`, so no more results. +8. So the first match is `match:"witch" and her "broom"`. If the regular expression has flag `pattern:g`, then the search will continue from where the first match ends. There are no more quotes in the rest of the string `subject:is one`, so no more results. That's probably not what we expected, but that's how it works. -**In the greedy mode (by default) the quantifier is repeated as many times as possible.** +**In the greedy mode (by default) a quantifier is repeated as many times as possible.** -The regexp engine tries to fetch as many characters as it can by `pattern:.+`, and then shortens that one by one. +The regexp engine adds to the match as many characters as it can for `pattern:.+`, and then shortens that one by one, if the rest of the pattern doesn't match. -For our task we want another thing. That's what the lazy quantifier mode is for. +For our task we want another thing. That's where a lazy mode can help. ## Lazy mode -The lazy mode of quantifier is an opposite to the greedy mode. It means: "repeat minimal number of times". +The lazy mode of quantifiers is an opposite to the greedy mode. It means: "repeat minimal number of times". We can enable it by putting a question mark `pattern:'?'` after the quantifier, so that it becomes `pattern:*?` or `pattern:+?` or even `pattern:??` for `pattern:'?'`. 
@@ -105,11 +105,11 @@ To make things clear: usually a question mark `pattern:?` is a quantifier by its The regexp `pattern:/".+?"/g` works as intended: it finds `match:"witch"` and `match:"broom"`: ```js run -let reg = /".+?"/g; +let regexp = /".+?"/g; let str = 'a "witch" and her "broom" is one'; -alert( str.match(reg) ); // witch, broom +alert( str.match(regexp) ); // witch, broom ``` To clearly understand the change, let's trace the search step by step. @@ -140,7 +140,7 @@ To clearly understand the change, let's trace the search step by step. ![](witch_lazy6.svg) -In this example we saw how the lazy mode works for `pattern:+?`. Quantifiers `pattern:+?` and `pattern:??` work the similar way -- the regexp engine increases the number of repetitions only if the rest of the pattern can't match on the given position. +In this example we saw how the lazy mode works for `pattern:+?`. Quantifiers `pattern:*?` and `pattern:??` work the similar way -- the regexp engine increases the number of repetitions only if the rest of the pattern can't match on the given position. **Laziness is only enabled for the quantifier with `?`.** @@ -149,20 +149,19 @@ Other quantifiers remain greedy. For instance: ```js run -alert( "123 456".match(/\d+ \d+?/g) ); // 123 4 +alert( "123 456".match(/\d+ \d+?/) ); // 123 4 ``` -1. The pattern `pattern:\d+` tries to match as many numbers as it can (greedy mode), so it finds `match:123` and stops, because the next character is a space `pattern:' '`. -2. Then there's a space in pattern, it matches. +1. The pattern `pattern:\d+` tries to match as many digits as it can (greedy mode), so it finds `match:123` and stops, because the next character is a space `pattern:' '`. +2. Then there's a space in the pattern, it matches. 3. Then there's `pattern:\d+?`. The quantifier is in lazy mode, so it finds one digit `match:4` and tries to check if the rest of the pattern matches from there. ...But there's nothing in the pattern after `pattern:\d+?`. 
The lazy mode doesn't repeat anything without a need. The pattern finished, so we're done. We have a match `match:123 4`. -4. The next search starts from the character `5`. ```smart header="Optimizations" -Modern regular expression engines can optimize internal algorithms to work faster. So they may work a bit different from the described algorithm. +Modern regular expression engines can optimize internal algorithms to work faster. So they may work a bit differently from the described algorithm. But to understand how regular expressions work and to build regular expressions, we don't need to know about that. They are only used internally to optimize things. @@ -176,11 +175,11 @@ With regexps, there's often more than one way to do the same thing. In our case we can find quoted strings without lazy mode using the regexp `pattern:"[^"]+"`: ```js run -let reg = /"[^"]+"/g; +let regexp = /"[^"]+"/g; let str = 'a "witch" and her "broom" is one'; -alert( str.match(reg) ); // witch, broom +alert( str.match(regexp) ); // witch, broom ``` The regexp `pattern:"[^"]+"` gives correct results, because it looks for a quote `pattern:'"'` followed by one or more non-quotes `pattern:[^"]`, and then the closing quote. @@ -202,20 +201,20 @@ The first idea might be: `pattern://g`. Let's check it: ```js run let str = '......'; -let reg = //g; +let regexp = //g; // Works! -alert( str.match(reg) ); // +alert( str.match(regexp) ); // ``` It worked. But let's see what happens if there are many links in the text? ```js run let str = '...... ...'; -let reg = //g; +let regexp = //g; // Whoops! Two links in one match! -alert( str.match(reg) ); // ... +alert( str.match(regexp) ); // ... ``` Now the result is wrong for the same reason as our "witches" example. The quantifier `pattern:.*` took too many characters. @@ -231,10 +230,10 @@ Let's modify the pattern by making the quantifier `pattern:.*?` lazy: ```js run let str = '...... ...'; -let reg = //g; +let regexp = //g; // Works! 
-alert( str.match(reg) ); // , +alert( str.match(regexp) ); // , ``` Now it seems to work, there are two matches: @@ -248,10 +247,10 @@ Now it seems to work, there are two matches: ```js run let str = '......

...'; -let reg = //g; +let regexp = //g; // Wrong match! -alert( str.match(reg) ); // ...

+alert( str.match(regexp) ); // ...

``` Now it fails. The match includes not just a link, but also a lot of text after it, including ``. @@ -264,7 +263,7 @@ That's what's going on: 2. Then it looks for `pattern:.*?`: takes one character (lazily!), check if there's a match for `pattern:" class="doc">` (none). 3. Then takes another character into `pattern:.*?`, and so on... until it finally reaches `match:" class="doc">`. -But the problem is: that's already beyond the link, in another tag `

`. Not what we want. +But the problem is: that's already beyond the link ``, in another tag `

`. Not what we want. Here's the picture of the match aligned with the text: @@ -273,22 +272,20 @@ Here's the picture of the match aligned with the text: ...

``` -So the laziness did not work for us here. +So, we need the pattern to look for ``, but both greedy and lazy variants have problems. -We need the pattern to look for ``, but both greedy and lazy variants have problems. - -The correct variant would be: `pattern:href="[^"]*"`. It will take all characters inside the `href` attribute till the nearest quote, just what we need. +The correct variant can be: `pattern:href="[^"]*"`. It will take all characters inside the `href` attribute till the nearest quote, just what we need. A working example: ```js run let str1 = '......

...'; let str2 = '...... ...'; -let reg = //g; +let regexp = //g; // Works! -alert( str1.match(reg) ); // null, no matches, that's correct -alert( str2.match(reg) ); // , +alert( str1.match(regexp) ); // null, no matches, that's correct +alert( str2.match(regexp) ); // , ``` ## Summary @@ -301,4 +298,4 @@ Greedy Lazy : Enabled by the question mark `pattern:?` after the quantifier. The regexp engine tries to match the rest of the pattern before each repetition of the quantifier. -As we've seen, the lazy mode is not a "panacea" from the greedy search. An alternative is a "fine-tuned" greedy search, with exclusions. Soon we'll see more examples of it. +As we've seen, the lazy mode is not a "panacea" from the greedy search. An alternative is a "fine-tuned" greedy search, with exclusions, as in the pattern `pattern:"[^"]+"`. diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy1.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy1.svg new file mode 100644 index 000000000..2eaf636cd --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy1.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy2.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy2.svg new file mode 100644 index 000000000..0489875a6 --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy2.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy3.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy3.svg new file mode 100644 index 000000000..f5175e5c3 --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy3.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy4.svg 
b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy4.svg new file mode 100644 index 000000000..61b37fb9c --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy4.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy5.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy5.svg new file mode 100644 index 000000000..a0c5f1fb8 --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy5.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy6.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy6.svg new file mode 100644 index 000000000..c7cc7537c --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy6.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy3.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy3.svg new file mode 100644 index 000000000..77d5d1562 --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy3.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy4.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy4.svg new file mode 100644 index 000000000..6c9cc29cf --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy4.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy5.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy5.svg new file mode 100644 index 000000000..68c77d27d --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy5.svg @@ -0,0 +1 @@ +a "witch" and 
her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy6.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy6.svg new file mode 100644 index 000000000..2ee64f5b8 --- /dev/null +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy6.svg @@ -0,0 +1 @@ +a "witch" and her "broom" is one \ No newline at end of file diff --git a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md b/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md deleted file mode 100644 index e448a4b12..000000000 --- a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md +++ /dev/null @@ -1,23 +0,0 @@ - -Opening tag is `pattern:\[(b|url|quote)\]`. - -Then to find everything till the closing tag -- let's use the pattern `pattern:.*?` with flag `s` to match any character including the newline and then add a backreference to the closing tag. - -The full pattern: `pattern:\[(b|url|quote)\].*?\[/\1\]`. - -In action: - -```js run -let reg = /\[(b|url|quote)\].*?\[\/\1\]/gs; - -let str = ` - [b]hello![/b] - [quote] - [url]http://google.com[/url] - [/quote] -`; - -alert( str.match(reg) ); // [b]hello![/b],[quote][url]http://google.com[/url][/quote] -``` - -Please note that we had to escape a slash for the closing tag `pattern:[/\1]`, because normally the slash closes the pattern. diff --git a/9-regular-expressions/11-regexp-alternation/article.md b/9-regular-expressions/11-regexp-alternation/article.md deleted file mode 100644 index b26f7e4a6..000000000 --- a/9-regular-expressions/11-regexp-alternation/article.md +++ /dev/null @@ -1,59 +0,0 @@ -# Alternation (OR) | - -Alternation is the term in regular expression that is actually a simple "OR". - -In a regular expression it is denoted with a vertical line character `pattern:|`. - -For instance, we need to find programming languages: HTML, PHP, Java or JavaScript. 
- -The corresponding regexp: `pattern:html|php|java(script)?`. - -A usage example: - -```js run -let reg = /html|php|css|java(script)?/gi; - -let str = "First HTML appeared, then CSS, then JavaScript"; - -alert( str.match(reg) ); // 'HTML', 'CSS', 'JavaScript' -``` - -We already know a similar thing -- square brackets. They allow to choose between multiple character, for instance `pattern:gr[ae]y` matches `match:gray` or `match:grey`. - -Square brackets allow only characters or character sets. Alternation allows any expressions. A regexp `pattern:A|B|C` means one of expressions `A`, `B` or `C`. - -For instance: - -- `pattern:gr(a|e)y` means exactly the same as `pattern:gr[ae]y`. -- `pattern:gra|ey` means `match:gra` or `match:ey`. - -To separate a part of the pattern for alternation we usually enclose it in parentheses, like this: `pattern:before(XXX|YYY)after`. - -## Regexp for time - -In previous chapters there was a task to build a regexp for searching time in the form `hh:mm`, for instance `12:00`. But a simple `pattern:\d\d:\d\d` is too vague. It accepts `25:99` as the time (as 99 seconds match the pattern). - -How can we make a better one? - -We can apply more careful matching. First, the hours: - -- If the first digit is `0` or `1`, then the next digit can by anything. -- Or, if the first digit is `2`, then the next must be `pattern:[0-3]`. - -As a regexp: `pattern:[01]\d|2[0-3]`. - -Next, the minutes must be from `0` to `59`. In the regexp language that means `pattern:[0-5]\d`: the first digit `0-5`, and then any digit. - -Let's glue them together into the pattern: `pattern:[01]\d|2[0-3]:[0-5]\d`. - -We're almost done, but there's a problem. The alternation `pattern:|` now happens to be between `pattern:[01]\d` and `pattern:2[0-3]:[0-5]\d`. - -That's wrong, as it should be applied only to hours `[01]\d` OR `2[0-3]`. That's a common mistake when starting to work with regular expressions. 
- -The correct variant: - -```js run -let reg = /([01]\d|2[0-3]):[0-5]\d/g; - -alert("00:00 10:10 23:59 25:99 1:2".match(reg)); // 00:00,10:10,23:59 -``` diff --git a/9-regular-expressions/11-regexp-groups/01-test-mac/solution.md b/9-regular-expressions/11-regexp-groups/01-test-mac/solution.md new file mode 100644 index 000000000..26f7888f7 --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/01-test-mac/solution.md @@ -0,0 +1,21 @@ +A two-digit hex number is `pattern:[0-9a-f]{2}` (assuming the flag `pattern:i` is set). + +We need that number `NN`, and then `:NN` repeated 5 times (more numbers); + +The regexp is: `pattern:[0-9a-f]{2}(:[0-9a-f]{2}){5}` + +Now let's show that the match should capture all the text: start at the beginning and end at the end. That's done by wrapping the pattern in `pattern:^...$`. + +Finally: + +```js run +let regexp = /^[0-9a-fA-F]{2}(:[0-9a-fA-F]{2}){5}$/i; + +alert( regexp.test('01:32:54:67:89:AB') ); // true + +alert( regexp.test('0132546789AB') ); // false (no colons) + +alert( regexp.test('01:32:54:67:89') ); // false (5 numbers, need 6) + +alert( regexp.test('01:32:54:67:89:ZZ') ) // false (ZZ in the end) +``` diff --git a/9-regular-expressions/12-regexp-anchors/2-test-mac/task.md b/9-regular-expressions/11-regexp-groups/01-test-mac/task.md similarity index 50% rename from 9-regular-expressions/12-regexp-anchors/2-test-mac/task.md rename to 9-regular-expressions/11-regexp-groups/01-test-mac/task.md index e72655984..029a4803a 100644 --- a/9-regular-expressions/12-regexp-anchors/2-test-mac/task.md +++ b/9-regular-expressions/11-regexp-groups/01-test-mac/task.md @@ -8,13 +8,13 @@ Write a regexp that checks whether a string is MAC-address. 
Usage: ```js -let reg = /your regexp/; +let regexp = /your regexp/; -alert( reg.test('01:32:54:67:89:AB') ); // true +alert( regexp.test('01:32:54:67:89:AB') ); // true -alert( reg.test('0132546789AB') ); // false (no colons) +alert( regexp.test('0132546789AB') ); // false (no colons) -alert( reg.test('01:32:54:67:89') ); // false (5 numbers, must be 6) +alert( regexp.test('01:32:54:67:89') ); // false (5 numbers, must be 6) -alert( reg.test('01:32:54:67:89:ZZ') ) // false (ZZ ad the end) +alert( regexp.test('01:32:54:67:89:ZZ') ) // false (ZZ at the end) ``` diff --git a/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/solution.md b/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/solution.md new file mode 100644 index 000000000..0806dc4fd --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/solution.md @@ -0,0 +1,27 @@ +A regexp to search 3-digit color `#abc`: `pattern:/#[a-f0-9]{3}/i`. + +We can add exactly 3 more optional hex digits. We don't need more or less. The color has either 3 or 6 digits. + +Let's use the quantifier `pattern:{1,2}` for that: we'll have `pattern:/#([a-f0-9]{3}){1,2}/i`. + +Here the pattern `pattern:[a-f0-9]{3}` is enclosed in parentheses to apply the quantifier `pattern:{1,2}`. + +In action: + +```js run +let regexp = /#([a-f0-9]{3}){1,2}/gi; + +let str = "color: #3f3; background-color: #AA00ef; and: #abcd"; + +alert( str.match(regexp) ); // #3f3 #AA00ef #abc +``` + +There's a minor problem here: the pattern found `match:#abc` in `subject:#abcd`.
To prevent that we can add `pattern:\b` to the end: + +```js run +let regexp = /#([a-f0-9]{3}){1,2}\b/gi; + +let str = "color: #3f3; background-color: #AA00ef; and: #abcd"; + +alert( str.match(regexp) ); // #3f3 #AA00ef +``` diff --git a/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/task.md b/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/task.md similarity index 59% rename from 9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/task.md rename to 9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/task.md index 4efd6f61f..09108484a 100644 --- a/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/task.md +++ b/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/task.md @@ -4,11 +4,11 @@ Write a RegExp that matches colors in the format `#abc` or `#abcdef`. That is: ` Usage example: ```js -let reg = /your regexp/g; +let regexp = /your regexp/g; let str = "color: #3f3; background-color: #AA00ef; and: #abcd"; -alert( str.match(reg) ); // #3f3 #AA00ef +alert( str.match(regexp) ); // #3f3 #AA00ef ``` -P.S. This should be exactly 3 or 6 hex digits: values like `#abcd` should not match. +P.S. This should be exactly 3 or 6 hex digits. Values with 4 digits, such as `#abcd`, should not match. diff --git a/9-regular-expressions/09-regexp-groups/4-find-decimal-numbers/solution.md b/9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/solution.md similarity index 51% rename from 9-regular-expressions/09-regexp-groups/4-find-decimal-numbers/solution.md rename to 9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/solution.md index dd2410847..c4349f9a0 100644 --- a/9-regular-expressions/09-regexp-groups/4-find-decimal-numbers/solution.md +++ b/9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/solution.md @@ -1,11 +1,11 @@ A positive number with an optional decimal part is (per previous task): `pattern:\d+(\.\d+)?`. 
-Let's add an optional `-` in the beginning: +Let's add the optional `pattern:-` in the beginning: ```js run -let reg = /-?\d+(\.\d+)?/g; +let regexp = /-?\d+(\.\d+)?/g; let str = "-1.5 0 2 -123.4."; -alert( str.match(reg) ); // -1.5, 0, 2, -123.4 +alert( str.match(regexp) ); // -1.5, 0, 2, -123.4 ``` diff --git a/9-regular-expressions/09-regexp-groups/4-find-decimal-numbers/task.md b/9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/task.md similarity index 71% rename from 9-regular-expressions/09-regexp-groups/4-find-decimal-numbers/task.md rename to 9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/task.md index 121a18a41..4f5a73fff 100644 --- a/9-regular-expressions/09-regexp-groups/4-find-decimal-numbers/task.md +++ b/9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/task.md @@ -5,9 +5,9 @@ Write a regexp that looks for all decimal numbers including integer ones, with t An example of use: ```js -let reg = /your regexp/g; +let regexp = /your regexp/g; let str = "-1.5 0 2 -123.4."; -alert( str.match(re) ); // -1.5, 0, 2, -123.4 +alert( str.match(regexp) ); // -1.5, 0, 2, -123.4 ``` diff --git a/9-regular-expressions/09-regexp-groups/5-parse-expression/solution.md b/9-regular-expressions/11-regexp-groups/04-parse-expression/solution.md similarity index 50% rename from 9-regular-expressions/09-regexp-groups/5-parse-expression/solution.md rename to 9-regular-expressions/11-regexp-groups/04-parse-expression/solution.md index 3db5f667c..130c57be3 100644 --- a/9-regular-expressions/09-regexp-groups/5-parse-expression/solution.md +++ b/9-regular-expressions/11-regexp-groups/04-parse-expression/solution.md @@ -1,21 +1,26 @@ A regexp for a number is: `pattern:-?\d+(\.\d+)?`. We created it in previous tasks. -An operator is `pattern:[-+*/]`. We put the dash `pattern:-` first, because in the middle it would mean a character range, we don't need that. +An operator is `pattern:[-+*/]`. 
The hyphen `pattern:-` goes first in the square brackets, because in the middle it would mean a character range, while we just want a character `-`. -Note that a slash should be escaped inside a JavaScript regexp `pattern:/.../`. +The slash `/` should be escaped inside a JavaScript regexp `pattern:/.../`, we'll do that later. We need a number, an operator, and then another number. And optional spaces between them. The full regular expression: `pattern:-?\d+(\.\d+)?\s*[-+*/]\s*-?\d+(\.\d+)?`. -To get a result as an array let's put parentheses around the data that we need: numbers and the operator: `pattern:(-?\d+(\.\d+)?)\s*([-+*/])\s*(-?\d+(\.\d+)?)`. +It has 3 parts, with `pattern:\s*` between them: +1. `pattern:-?\d+(\.\d+)?` - the first number, +1. `pattern:[-+*/]` - the operator, +1. `pattern:-?\d+(\.\d+)?` - the second number. + +To make each of these parts a separate element of the result array, let's enclose them in parentheses: `pattern:(-?\d+(\.\d+)?)\s*([-+*/])\s*(-?\d+(\.\d+)?)`. In action: ```js run -let reg = /(-?\d+(\.\d+)?)\s*([-+*\/])\s*(-?\d+(\.\d+)?)/; +let regexp = /(-?\d+(\.\d+)?)\s*([-+*\/])\s*(-?\d+(\.\d+)?)/; -alert( "1.2 + 12".match(reg) ); +alert( "1.2 + 12".match(regexp) ); ``` The result includes: @@ -27,19 +32,19 @@ The result includes: - `result[4] == "12"` (forth group `(-?\d+(\.\d+)?)` -- the second number) - `result[5] == undefined` (fifth group `(\.\d+)?` -- the last decimal part is absent, so it's undefined) -We only want the numbers and the operator, without the full match or the decimal parts. +We only want the numbers and the operator, without the full match or the decimal parts, so let's "clean" the result a bit. -The full match (the arrays first item) can be removed by shifting the array `pattern:result.shift()`. +The full match (the arrays first item) can be removed by shifting the array `result.shift()`. 
-The decimal groups can be removed by making them into non-capturing groups, by adding `pattern:?:` to the beginning: `pattern:(?:\.\d+)?`. +Groups that contain decimal parts (numbers 2 and 5) `pattern:(\.\d+)` can be excluded by adding `pattern:?:` to the beginning: `pattern:(?:\.\d+)?`. The final solution: ```js run function parse(expr) { - let reg = /(-?\d+(?:\.\d+)?)\s*([-+*\/])\s*(-?\d+(?:\.\d+)?)/; + let regexp = /(-?\d+(?:\.\d+)?)\s*([-+*\/])\s*(-?\d+(?:\.\d+)?)/; - let result = expr.match(reg); + let result = expr.match(regexp); if (!result) return []; result.shift(); diff --git a/9-regular-expressions/09-regexp-groups/5-parse-expression/task.md b/9-regular-expressions/11-regexp-groups/04-parse-expression/task.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/5-parse-expression/task.md rename to 9-regular-expressions/11-regexp-groups/04-parse-expression/task.md diff --git a/9-regular-expressions/11-regexp-groups/article.md b/9-regular-expressions/11-regexp-groups/article.md new file mode 100644 index 000000000..ab25066d7 --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/article.md @@ -0,0 +1,364 @@ +# Capturing groups + +A part of a pattern can be enclosed in parentheses `pattern:(...)`. This is called a "capturing group". + +That has two effects: + +1. It allows to get a part of the match as a separate item in the result array. +2. If we put a quantifier after the parentheses, it applies to the parentheses as a whole. + +## Examples + +Let's see how parentheses work in examples. + +### Example: gogogo + +Without parentheses, the pattern `pattern:go+` means `subject:g` character, followed by `subject:o` repeated one or more times. For instance, `match:goooo` or `match:gooooooooo`. + +Parentheses group characters together, so `pattern:(go)+` means `match:go`, `match:gogo`, `match:gogogo` and so on.
+ +```js run +alert( 'Gogogo now!'.match(/(go)+/i) ); // "Gogogo" +``` + +### Example: domain + +Let's make something more complex -- a regular expression to search for a website domain. + +For example: + +``` +mail.com +users.mail.com +smith.users.mail.com +``` + +As we can see, a domain consists of repeated words, a dot after each one except the last one. + +In regular expressions that's `pattern:(\w+\.)+\w+`: + +```js run +let regexp = /(\w+\.)+\w+/g; + +alert( "site.com my.site.com".match(regexp) ); // site.com,my.site.com +``` + +The search works, but the pattern can't match a domain with a hyphen, e.g. `my-site.com`, because the hyphen does not belong to class `pattern:\w`. + +We can fix it by replacing `pattern:\w` with `pattern:[\w-]` in every word except the last one: `pattern:([\w-]+\.)+\w+`. + +### Example: email + +The previous example can be extended. We can create a regular expression for emails based on it. + +The email format is: `name@domain`. Any word can be the name, hyphens and dots are allowed. In regular expressions that's `pattern:[-.\w]+`. + +The pattern: + +```js run +let regexp = /[-.\w]+@([\w-]+\.)+[\w-]+/g; + +alert("my@mail.com @ his@site.com.uk".match(regexp)); // my@mail.com, his@site.com.uk +``` + +That regexp is not perfect, but mostly works and helps to fix accidental mistypes. The only truly reliable check for an email can only be done by sending a letter. + +## Parentheses contents in the match + +Parentheses are numbered from left to right. The search engine memorizes the content matched by each of them and allows to get it in the result. + +The method `str.match(regexp)`, if `regexp` has no flag `g`, looks for the first match and returns it as an array: + +1. At index `0`: the full match. +2. At index `1`: the contents of the first parentheses. +3. At index `2`: the contents of the second parentheses. +4. ...and so on... + +For instance, we'd like to find HTML tags `pattern:<.*?>`, and process them. 
It would be convenient to have tag content (what's inside the angles), in a separate variable. + +Let's wrap the inner content into parentheses, like this: `pattern:<(.*?)>`. + +Now we'll get both the tag as a whole `match:

` and its contents `match:h1` in the resulting array: + +```js run +let str = '

Hello, world!

'; + +let tag = str.match(/<(.*?)>/); + +alert( tag[0] ); //

+alert( tag[1] ); // h1 +``` + +### Nested groups + +Parentheses can be nested. In this case the numbering also goes from left to right. + +For instance, when searching a tag in `subject:` we may be interested in: + +1. The tag content as a whole: `match:span class="my"`. +2. The tag name: `match:span`. +3. The tag attributes: `match:class="my"`. + +Let's add parentheses for them: `pattern:<(([a-z]+)\s*([^>]*))>`. + +Here's how they are numbered (left to right, by the opening paren): + +![](regexp-nested-groups-pattern.svg) + +In action: + +```js run +let str = ''; + +let regexp = /<(([a-z]+)\s*([^>]*))>/; + +let result = str.match(regexp); +alert(result[0]); // +alert(result[1]); // span class="my" +alert(result[2]); // span +alert(result[3]); // class="my" +``` + +The zero index of `result` always holds the full match. + +Then groups, numbered from left to right by an opening paren. The first group is returned as `result[1]`. Here it encloses the whole tag content. + +Then in `result[2]` goes the group from the second opening paren `pattern:([a-z]+)` - tag name, then in `result[3]` the tag: `pattern:([^>]*)`. + +The contents of every group in the string: + +![](regexp-nested-groups-matches.svg) + +### Optional groups + +Even if a group is optional and doesn't exist in the match (e.g. has the quantifier `pattern:(...)?`), the corresponding `result` array item is present and equals `undefined`. + +For instance, let's consider the regexp `pattern:a(z)?(c)?`. It looks for `"a"` optionally followed by `"z"` optionally followed by `"c"`. + +If we run it on the string with a single letter `subject:a`, then the result is: + +```js run +let match = 'a'.match(/a(z)?(c)?/); + +alert( match.length ); // 3 +alert( match[0] ); // a (whole match) +alert( match[1] ); // undefined +alert( match[2] ); // undefined +``` + +The array has the length of `3`, but all groups are empty. 
+ +And here's a more complex match for the string `subject:ac`: + +```js run +let match = 'ac'.match(/a(z)?(c)?/) + +alert( match.length ); // 3 +alert( match[0] ); // ac (whole match) +alert( match[1] ); // undefined, because there's nothing for (z)? +alert( match[2] ); // c +``` + +The array length is permanent: `3`. But there's nothing for the group `pattern:(z)?`, so the result is `["ac", undefined, "c"]`. + +## Searching for all matches with groups: matchAll + +```warn header="`matchAll` is a new method, polyfill may be needed" +The method `matchAll` is not supported in old browsers. + +A polyfill may be required, such as . +``` + +When we search for all matches (flag `pattern:g`), the `match` method does not return contents for groups. + +For example, let's find all tags in a string: + +```js run +let str = '

'; + +let tags = str.match(/<(.*?)>/g); + +alert( tags ); //

,

+``` + +The result is an array of matches, but without details about each of them. But in practice we usually need contents of capturing groups in the result. + +To get them, we should search using the method `str.matchAll(regexp)`. + +It was added to JavaScript language long after `match`, as its "new and improved version". + +Just like `match`, it looks for matches, but there are 3 differences: + +1. It returns not an array, but an iterable object. +2. When the flag `pattern:g` is present, it returns every match as an array with groups. +3. If there are no matches, it returns not `null`, but an empty iterable object. + +For instance: + +```js run +let results = '

'.matchAll(/<(.*?)>/gi); + +// results - is not an array, but an iterable object +alert(results); // [object RegExp String Iterator] + +alert(results[0]); // undefined (*) + +results = Array.from(results); // let's turn it into array + +alert(results[0]); //

,h1 (1st tag) +alert(results[1]); //

,h2 (2nd tag) +``` + +As we can see, the first difference is very important, as demonstrated in the line `(*)`. We can't get the match as `results[0]`, because that object isn't a pseudoarray. We can turn it into a real `Array` using `Array.from`. There are more details about pseudoarrays and iterables in the article <info:iterable>. + +There's no need for `Array.from` if we're looping over results: + +```js run +let results = '

'.matchAll(/<(.*?)>/gi); + +for(let result of results) { + alert(result); + // first alert:

,h1 + // second:

,h2 +} +``` + +...Or using destructuring: + +```js +let [tag1, tag2] = '

'.matchAll(/<(.*?)>/gi); +``` + +Every match, returned by `matchAll`, has the same format as returned by `match` without flag `pattern:g`: it's an array with additional properties `index` (match index in the string) and `input` (source string): + +```js run +let results = '

'.matchAll(/<(.*?)>/gi); + +let [tag1, tag2] = results; + +alert( tag1[0] ); //

+alert( tag1[1] ); // h1 +alert( tag1.index ); // 0 +alert( tag1.input ); //

+``` + +```smart header="Why is a result of `matchAll` an iterable object, not an array?" +Why is the method designed like that? The reason is simple - for the optimization. + +The call to `matchAll` does not perform the search. Instead, it returns an iterable object, without the results initially. The search is performed each time we iterate over it, e.g. in the loop. + +So, only as many results as needed will be found, not more. + +E.g. there are potentially 100 matches in the text, but in a `for..of` loop we find 5 of them, then decide it's enough and make a `break`. Then the engine won't spend time finding the other 95 matches. +``` + +## Named groups + +Remembering groups by their numbers is hard. For simple patterns it's doable, but for more complex ones counting parentheses is inconvenient. We have a much better option: give names to parentheses. + +That's done by putting `pattern:?<name>` immediately after the opening paren. + +For example, let's look for a date in the format "year-month-day": + +```js run +*!* +let dateRegexp = /(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})/; +*/!* +let str = "2019-04-30"; + +let groups = str.match(dateRegexp).groups; + +alert(groups.year); // 2019 +alert(groups.month); // 04 +alert(groups.day); // 30 +``` + +As you can see, the groups reside in the `.groups` property of the match. + +To look for all dates, we can add flag `pattern:g`. + +We'll also need `matchAll` to obtain full matches, together with groups: + +```js run +let dateRegexp = /(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})/g; + +let str = "2019-10-30 2020-01-01"; + +let results = str.matchAll(dateRegexp); + +for(let result of results) { + let {year, month, day} = result.groups; + + alert(`${day}.${month}.${year}`); + // first alert: 30.10.2019 + // second: 01.01.2020 +} +``` + +## Capturing groups in replacement + +Method `str.replace(regexp, replacement)` that replaces all matches with `regexp` in `str` allows to use parentheses contents in the `replacement` string.
That's done using `pattern:$n`, where `pattern:n` is the group number. + +For example, + +```js run +let str = "John Bull"; +let regexp = /(\w+) (\w+)/; + +alert( str.replace(regexp, '$2, $1') ); // Bull, John +``` + +For named parentheses the reference will be `pattern:$<name>`. + +For example, let's reformat dates from "year-month-day" to "day.month.year": + +```js run +let regexp = /(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})/g; + +let str = "2019-10-30, 2020-01-01"; + +alert( str.replace(regexp, '$<day>.$<month>.$<year>') ); +// 30.10.2019, 01.01.2020 +``` + +## Non-capturing groups with ?: + +Sometimes we need parentheses to correctly apply a quantifier, but we don't want their contents in results. + +A group may be excluded by adding `pattern:?:` in the beginning. + +For instance, if we want to find `pattern:(go)+`, but don't want the parentheses contents (`go`) as a separate array item, we can write: `pattern:(?:go)+`. + +In the example below we only get the name `match:John` as a separate member of the match: + +```js run +let str = "Gogogo John!"; + +*!* +// ?: excludes 'go' from capturing +let regexp = /(?:go)+ (\w+)/i; +*/!* + +let result = str.match(regexp); + +alert( result[0] ); // Gogogo John (full match) +alert( result[1] ); // John +alert( result.length ); // 2 (no more items in the array) +``` + +## Summary + +Parentheses group together a part of the regular expression, so that the quantifier applies to it as a whole. + +Parentheses groups are numbered left-to-right, and can optionally be named with `(?<name>...)`. + +The content, matched by a group, can be obtained in the results: + +- The method `str.match` returns capturing groups only without flag `pattern:g`. +- The method `str.matchAll` always returns capturing groups. + +If the parentheses have no name, then their contents is available in the match array by its number. Named parentheses are also available in the property `groups`.
+ +We can also use parentheses contents in the replacement string in `str.replace`: by the number `$n` or the name `$`. + +A group may be excluded from numbering by adding `pattern:?:` in its start. That's used when we need to apply a quantifier to the whole group, but don't want it as a separate item in the results array. We also can't reference such parentheses in the replacement string. diff --git a/9-regular-expressions/11-regexp-groups/regexp-nested-groups-matches.svg b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-matches.svg new file mode 100644 index 000000000..ce61ff3a7 --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-matches.svg @@ -0,0 +1 @@ +< (( [a-z]+ ) \s* ( [^>]* )) >1span class="my"2span3class="my" \ No newline at end of file diff --git a/9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg new file mode 100644 index 000000000..ce61ff3a7 --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg @@ -0,0 +1 @@ +< (( [a-z]+ ) \s* ( [^>]* )) >1span class="my"2span3class="my" \ No newline at end of file diff --git a/9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md b/9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md deleted file mode 100644 index 422bc65e4..000000000 --- a/9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md +++ /dev/null @@ -1,21 +0,0 @@ -A two-digit hex number is `pattern:[0-9a-f]{2}` (assuming the `pattern:i` flag is enabled). - -We need that number `NN`, and then `:NN` repeated 5 times (more numbers); - -The regexp is: `pattern:[0-9a-f]{2}(:[0-9a-f]{2}){5}` - -Now let's show that the match should capture all the text: start at the beginning and end at the end. That's done by wrapping the pattern in `pattern:^...$`. 
- -Finally: - -```js run -let reg = /^[0-9a-fA-F]{2}(:[0-9a-fA-F]{2}){5}$/i; - -alert( reg.test('01:32:54:67:89:AB') ); // true - -alert( reg.test('0132546789AB') ); // false (no colons) - -alert( reg.test('01:32:54:67:89') ); // false (5 numbers, need 6) - -alert( reg.test('01:32:54:67:89:ZZ') ) // false (ZZ in the end) -``` diff --git a/9-regular-expressions/12-regexp-anchors/article.md b/9-regular-expressions/12-regexp-anchors/article.md deleted file mode 100644 index 0c2dd578a..000000000 --- a/9-regular-expressions/12-regexp-anchors/article.md +++ /dev/null @@ -1,55 +0,0 @@ -# String start ^ and finish $ - -The caret `pattern:'^'` and dollar `pattern:'$'` characters have special meaning in a regexp. They are called "anchors". - -The caret `pattern:^` matches at the beginning of the text, and the dollar `pattern:$` -- in the end. - -For instance, let's test if the text starts with `Mary`: - -```js run -let str1 = "Mary had a little lamb, it's fleece was white as snow"; -let str2 = 'Everywhere Mary went, the lamp was sure to go'; - -alert( /^Mary/.test(str1) ); // true -alert( /^Mary/.test(str2) ); // false -``` - -The pattern `pattern:^Mary` means: "the string start and then Mary". - -Now let's test whether the text ends with an email. - -To match an email, we can use a regexp `pattern:[-.\w]+@([\w-]+\.)+[\w-]{2,20}`. - -To test whether the string ends with the email, let's add `pattern:$` to the pattern: - -```js run -let reg = /[-.\w]+@([\w-]+\.)+[\w-]{2,20}$/g; - -let str1 = 'My email is mail@site.com'; -let str2 = 'Everywhere Mary went, the lamp was sure to go'; - -alert( reg.test(str1) ); // true -alert( reg.test(str2) ); // false -``` - -We can use both anchors together to check whether the string exactly follows the pattern. That's often used for validation. - -For instance we want to check that `str` is exactly a color in the form `#` plus 6 hex digits. The pattern for the color is `pattern:#[0-9a-f]{6}`. 
- -To check that the *whole string* exactly matches it, we add `pattern:^...$`: - -```js run -let str = "#abcdef"; - -alert( /^#[0-9a-f]{6}$/i.test(str) ); // true -``` - -The regexp engine looks for the text start, then the color, and then immediately the text end. Just what we need. - -```smart header="Anchors have zero length" -Anchors just like `\b` are tests. They have zero-width. - -In other words, they do not match a character, but rather force the regexp engine to check the condition (text start/end). -``` - -The behavior of anchors changes if there's a flag `pattern:m` (multiline mode). We'll explore it in the next chapter. diff --git a/9-regular-expressions/12-regexp-backreferences/article.md b/9-regular-expressions/12-regexp-backreferences/article.md new file mode 100644 index 000000000..83beb803a --- /dev/null +++ b/9-regular-expressions/12-regexp-backreferences/article.md @@ -0,0 +1,72 @@ +# Backreferences in pattern: \N and \k + +We can use the contents of capturing groups `pattern:(...)` not only in the result or in the replacement string, but also in the pattern itself. + +## Backreference by number: \N + +A group can be referenced in the pattern using `pattern:\N`, where `N` is the group number. + +To make clear why that's helpful, let's consider a task. + +We need to find quoted strings: either single-quoted `subject:'...'` or a double-quoted `subject:"..."` -- both variants should match. + +How to find them? + +We can put both kinds of quotes in the square brackets: `pattern:['"](.*?)['"]`, but it would find strings with mixed quotes, like `match:"...'` and `match:'..."`. 
That would lead to incorrect matches when one quote appears inside other ones, like in the string `subject:"She's the one!"`: + +```js run +let str = `He said: "She's the one!".`; + +let regexp = /['"](.*?)['"]/g; + +// The result is not what we'd like to have +alert( str.match(regexp) ); // "She' +``` + +As we can see, the pattern found an opening quote `match:"`, then the text is consumed till the other quote `match:'`, that closes the match. + +To make sure that the pattern looks for the closing quote exactly the same as the opening one, we can wrap it into a capturing group and backreference it: `pattern:(['"])(.*?)\1`. + +Here's the correct code: + +```js run +let str = `He said: "She's the one!".`; + +*!* +let regexp = /(['"])(.*?)\1/g; +*/!* + +alert( str.match(regexp) ); // "She's the one!" +``` + +Now it works! The regular expression engine finds the first quote `pattern:(['"])` and memorizes its content. That's the first capturing group. + +Further in the pattern `pattern:\1` means "find the same text as in the first group", exactly the same quote in our case. + +Similar to that, `pattern:\2` would mean the contents of the second group, `pattern:\3` - the 3rd group, and so on. + +```smart +If we use `?:` in the group, then we can't reference it. Groups that are excluded from capturing `(?:...)` are not memorized by the engine. +``` + +```warn header="Don't mess up: in the pattern `pattern:\1`, in the replacement: `pattern:$1`" +In the replacement string we use a dollar sign: `pattern:$1`, while in the pattern - a backslash `pattern:\1`. +``` + +## Backreference by name: `\k<name>` + +If a regexp has many parentheses, it's convenient to give them names. + +To reference a named group we can use `pattern:\k<name>`.
+ +In the example below the group with quotes is named `pattern:?<quote>`, so the backreference is `pattern:\k<quote>`: + +```js run +let str = `He said: "She's the one!".`; + +*!* +let regexp = /(?<quote>['"])(.*?)\k<quote>/g; +*/!* + +alert( str.match(regexp) ); // "She's the one!" +``` diff --git a/9-regular-expressions/11-regexp-alternation/01-find-programming-language/solution.md b/9-regular-expressions/13-regexp-alternation/01-find-programming-language/solution.md similarity index 79% rename from 9-regular-expressions/11-regexp-alternation/01-find-programming-language/solution.md rename to 9-regular-expressions/13-regexp-alternation/01-find-programming-language/solution.md index 3419aa498..e33f9cf2f 100644 --- a/9-regular-expressions/11-regexp-alternation/01-find-programming-language/solution.md +++ b/9-regular-expressions/13-regexp-alternation/01-find-programming-language/solution.md @@ -4,11 +4,11 @@ The first idea can be to list the languages with `|` in-between. But that doesn't work right: ```js run -let reg = /Java|JavaScript|PHP|C|C\+\+/g; +let regexp = /Java|JavaScript|PHP|C|C\+\+/g; let str = "Java, JavaScript, PHP, C, C++"; -alert( str.match(reg) ); // Java,Java,PHP,C,C +alert( str.match(regexp) ); // Java,Java,PHP,C,C ``` The regular expression engine looks for alternations one-by-one. That is: first it checks if we have `match:Java`, otherwise -- looks for `match:JavaScript` and so on.
@@ -25,9 +25,9 @@ There are two solutions for that problem: In action: ```js run -let reg = /Java(Script)?|C(\+\+)?|PHP/g; +let regexp = /Java(Script)?|C(\+\+)?|PHP/g; let str = "Java, JavaScript, PHP, C, C++"; -alert( str.match(reg) ); // Java,JavaScript,PHP,C,C++ +alert( str.match(regexp) ); // Java,JavaScript,PHP,C,C++ ``` diff --git a/9-regular-expressions/11-regexp-alternation/01-find-programming-language/task.md b/9-regular-expressions/13-regexp-alternation/01-find-programming-language/task.md similarity index 65% rename from 9-regular-expressions/11-regexp-alternation/01-find-programming-language/task.md rename to 9-regular-expressions/13-regexp-alternation/01-find-programming-language/task.md index 61b9526f7..e0f7af95c 100644 --- a/9-regular-expressions/11-regexp-alternation/01-find-programming-language/task.md +++ b/9-regular-expressions/13-regexp-alternation/01-find-programming-language/task.md @@ -5,7 +5,7 @@ There are many programming languages, for instance Java, JavaScript, PHP, C, C++ Create a regexp that finds them in the string `subject:Java JavaScript PHP C++ C`: ```js -let reg = /your regexp/g; +let regexp = /your regexp/g; -alert("Java JavaScript PHP C++ C".match(reg)); // Java JavaScript PHP C++ C +alert("Java JavaScript PHP C++ C".match(regexp)); // Java JavaScript PHP C++ C ``` diff --git a/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md new file mode 100644 index 000000000..9b3fa1877 --- /dev/null +++ b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md @@ -0,0 +1,23 @@ + +Opening tag is `pattern:\[(b|url|quote)\]`. + +Then to find everything till the closing tag -- let's use the pattern `pattern:.*?` with flag `pattern:s` to match any character including the newline and then add a backreference to the closing tag. + +The full pattern: `pattern:\[(b|url|quote)\].*?\[/\1\]`. 
+ +In action: + +```js run +let regexp = /\[(b|url|quote)\].*?\[\/\1\]/gs; + +let str = ` + [b]hello![/b] + [quote] + [url]http://google.com[/url] + [/quote] +`; + +alert( str.match(regexp) ); // [b]hello![/b],[quote][url]http://google.com[/url][/quote] +``` + +Please note that besides escaping `pattern:[` and `pattern:]`, we had to escape a slash for the closing tag `pattern:[\/\1]`, because normally the slash closes the pattern. diff --git a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/task.md b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/task.md similarity index 78% rename from 9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/task.md rename to 9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/task.md index 8cc59deb3..72d715afd 100644 --- a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/task.md +++ b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/task.md @@ -15,7 +15,7 @@ Normal: [url] [b]http://google.com[/b] [/url] [quote] [b]text[/b] [/quote] -Impossible: +Can't happen: [b][b]text[/b][/b] ``` @@ -32,17 +32,17 @@ Create a regexp to find all BB-tags with their contents. 
For instance: ```js -let reg = /your regexp/flags; +let regexp = /your regexp/flags; let str = "..[url]http://google.com[/url].."; -alert( str.match(reg) ); // [url]http://google.com[/url] +alert( str.match(regexp) ); // [url]http://google.com[/url] ``` If tags are nested, then we need the outer tag (if we want we can continue the search in its content): ```js -let reg = /your regexp/flags; +let regexp = /your regexp/flags; let str = "..[url][b]http://google.com[/b][/url].."; -alert( str.match(reg) ); // [url][b]http://google.com[/b][/url] +alert( str.match(regexp) ); // [url][b]http://google.com[/b][/url] ``` diff --git a/9-regular-expressions/11-regexp-alternation/03-match-quoted-string/solution.md b/9-regular-expressions/13-regexp-alternation/03-match-quoted-string/solution.md similarity index 87% rename from 9-regular-expressions/11-regexp-alternation/03-match-quoted-string/solution.md rename to 9-regular-expressions/13-regexp-alternation/03-match-quoted-string/solution.md index 143be870c..5a007aee0 100644 --- a/9-regular-expressions/11-regexp-alternation/03-match-quoted-string/solution.md +++ b/9-regular-expressions/13-regexp-alternation/03-match-quoted-string/solution.md @@ -10,8 +10,8 @@ Step by step: In action: ```js run -let reg = /"(\\.|[^"\\])*"/g; +let regexp = /"(\\.|[^"\\])*"/g; let str = ' .. "test me" .. "Say \\"Hello\\"!" .. "\\\\ \\"" .. 
'; -alert( str.match(reg) ); // "test me","Say \"Hello\"!","\\ \"" +alert( str.match(regexp) ); // "test me","Say \"Hello\"!","\\ \"" ``` diff --git a/9-regular-expressions/11-regexp-alternation/03-match-quoted-string/task.md b/9-regular-expressions/13-regexp-alternation/03-match-quoted-string/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/03-match-quoted-string/task.md rename to 9-regular-expressions/13-regexp-alternation/03-match-quoted-string/task.md diff --git a/9-regular-expressions/11-regexp-alternation/04-match-exact-tag/solution.md b/9-regular-expressions/13-regexp-alternation/04-match-exact-tag/solution.md similarity index 72% rename from 9-regular-expressions/11-regexp-alternation/04-match-exact-tag/solution.md rename to 9-regular-expressions/13-regexp-alternation/04-match-exact-tag/solution.md index 70c4de91a..5d4ba8d96 100644 --- a/9-regular-expressions/11-regexp-alternation/04-match-exact-tag/solution.md +++ b/9-regular-expressions/13-regexp-alternation/04-match-exact-tag/solution.md @@ -10,7 +10,7 @@ In the regexp language: `pattern:<style(>|\s.*?>)`. In action: ```js run -let reg = /<style(>|\s.*?>)/g; +let regexp = /<style(>|\s.*?>)/g; -alert( '