From 7d7258d641c0603a5b0265a3577a4bfff0982c3a Mon Sep 17 00:00:00 2001 From: Kyrela Date: Sat, 16 Dec 2023 23:25:03 +0100 Subject: [PATCH 1/6] Relative imports --- discord_markdown_ast_parser/__init__.py | 4 ++-- discord_markdown_ast_parser/parser.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/discord_markdown_ast_parser/__init__.py b/discord_markdown_ast_parser/__init__.py index 608dbe8..02a4994 100644 --- a/discord_markdown_ast_parser/__init__.py +++ b/discord_markdown_ast_parser/__init__.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional -from lexer import lex, Lexing -from parser import Node, parse_tokens +from .lexer import lex, Lexing +from .parser import Node, parse_tokens def lexing_list_convert(lexing: Lexing) -> List[Lexing]: diff --git a/discord_markdown_ast_parser/parser.py b/discord_markdown_ast_parser/parser.py index cb814d2..34f1eff 100644 --- a/discord_markdown_ast_parser/parser.py +++ b/discord_markdown_ast_parser/parser.py @@ -4,7 +4,7 @@ import itertools from typing import Optional, Generator, Any, List, Dict, Tuple, Iterable -from lexer import Token, LexingRule, Lexing +from .lexer import Token, LexingRule, Lexing NodeType = Enum( From a23c1227cbe4147eadb65c4722085a4149a7767e Mon Sep 17 00:00:00 2001 From: Kyrela Date: Sat, 16 Dec 2023 23:25:23 +0100 Subject: [PATCH 2/6] Better Typehints --- discord_markdown_ast_parser/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discord_markdown_ast_parser/__init__.py b/discord_markdown_ast_parser/__init__.py index 02a4994..1c7f83c 100644 --- a/discord_markdown_ast_parser/__init__.py +++ b/discord_markdown_ast_parser/__init__.py @@ -1,10 +1,10 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Union from .lexer import lex, Lexing from .parser import Node, parse_tokens -def lexing_list_convert(lexing: Lexing) -> List[Lexing]: +def lexing_list_convert(lexing: Union[List[Lexing], Lexing]) -> List[Lexing]: if not isinstance(lexing, list): lexing = [lexing] return [Lexing(item) if isinstance(item, str) else item for item in lexing] From 182b4440b0fd7ba639afcdc0cebeb371f2180eb6 Mon Sep 17 00:00:00 2001 From: Kyrela Date: Sun, 17 Dec 2023 15:30:18 +0100 Subject: [PATCH 3/6] Missing NodeTypes (EMOJI_CUSTOM_ANIMATED, TIMESTAMP) --- discord_markdown_ast_parser/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/discord_markdown_ast_parser/parser.py b/discord_markdown_ast_parser/parser.py index 34f1eff..5386727 100644 --- a/discord_markdown_ast_parser/parser.py +++ b/discord_markdown_ast_parser/parser.py @@ -21,12 +21,14 @@ "CHANNEL", "SLASH_COMMAND", "EMOJI_CUSTOM", + "EMOJI_CUSTOM_ANIMATED", "EMOJI_UNICODE", "EMOJI_UNICODE_ENCODED", "URL_WITH_PREVIEW_EMBEDDED", "URL_WITHOUT_PREVIEW_EMBEDDED", "URL_WITH_PREVIEW", "URL_WITHOUT_PREVIEW", + "TIMESTAMP", "QUOTE_BLOCK", "CODE_BLOCK", "CODE_INLINE", From 9eb841b2b4a1a6c95f72b5a90421f372fdf995ab Mon Sep 17 00:00:00 2001 From: Kyrela Date: Mon, 24 Jun 2024 12:30:57 +0200 Subject: [PATCH 4/6] Better url regex (not vulnerable to ReDoS) --- discord_markdown_ast_parser/lexer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/discord_markdown_ast_parser/lexer.py b/discord_markdown_ast_parser/lexer.py index d890243..34c7453 100644 --- a/discord_markdown_ast_parser/lexer.py +++ b/discord_markdown_ast_parser/lexer.py @@ -15,10 +15,9 @@ def __call__(self, text: str): def __repr__(self): return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})" -# stolen from https://www.urlregex.com/ -URL_REGEX = ( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" -) + +URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)" + class LexingRule(Lexing, Enum): USER_MENTION = r"<@!?(\d{15,20})>" From 8d17721fe298740171139140abd2e6d9b24b30eb Mon Sep 17 00:00:00 2001 From: Kyrela Date: Mon, 24 Jun 2024 12:33:59 +0200 Subject: [PATCH 5/6] Corrected style code --- discord_markdown_ast_parser/lexer.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/discord_markdown_ast_parser/lexer.py b/discord_markdown_ast_parser/lexer.py index 34c7453..b566798 100644 --- a/discord_markdown_ast_parser/lexer.py +++ b/discord_markdown_ast_parser/lexer.py @@ -1,5 +1,5 @@ import re -from dataclasses import dataclass, InitVar, field +from dataclasses import dataclass, field from enum import Enum from typing import Optional, List, Generator, Dict import itertools @@ -8,10 +8,10 @@ class Lexing: def __init__(self, pattern: Optional[str] = None, flags: re.RegexFlag = re.NOFLAG): self.regex = re.compile(pattern, flags=flags) if pattern else None - + def __call__(self, text: str): return self.regex and self.regex.match(text) - + def __repr__(self): return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})" @@ -22,16 +22,16 @@ def __repr__(self): class LexingRule(Lexing, Enum): USER_MENTION = r"<@!?(\d{15,20})>" ROLE_MENTION = r"<@&(\d{15,20})>" - SLASH_COMMAND_MENTION = r"" + SLASH_COMMAND_MENTION = r"" CHANNEL_MENTION = r"<#(\d{15,20})>" TIMESTAMP = r"" EMOJI_CUSTOM = r"<:([a-zA-Z0-9_]{2,}):(\d{15,20})>" EMOJI_CUSTOM_ANIMATED = r"" EMOJI_UNICODE = r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])" EMOJI_UNICODE_ENCODED = r":([a-zA-Z0-9_]+):" - URL_WITHOUT_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(<({URL_REGEX})>\)" - URL_WITH_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(({URL_REGEX})\)" - URL_WITHOUT_PREVIEW = f"<{URL_REGEX}>" + URL_WITHOUT_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(<({URL_REGEX})>\)" + URL_WITH_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(({URL_REGEX})\)" + URL_WITHOUT_PREVIEW = fr"<{URL_REGEX}>" URL_WITH_PREVIEW = URL_REGEX QUOTE_LINE_PREFIX = r"(>>)?> " TILDE = r"~" @@ -49,12 +49,11 @@ class Token: value: str = "" lexing_rule: Lexing = LexingRule.TEXT_INLINE groups: List[str] = field(default_factory=list) - + def __contains__(self, rule: Lexing): return self.lexing_rule == rule - def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]: """Lexes the input text and returns a generator of tokens. The generator will yield a token for each lexing rule that matches the input text. @@ -67,7 +66,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge """ seen_simple_text = "" custom = custom or {} - + while input_text: for rule in itertools.chain(*custom.values(), LexingRule): match = rule(input_text) @@ -80,7 +79,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge continue # don't yield a token in this run # cut off matched part - input_text = input_text[len(match[0]) :] + input_text = input_text[len(match[0]):] # yield inline text if we have some left if len(seen_simple_text) > 0: From 4d6ddcf4b70642a02277c96ae3e4d0f9a5398ea0 Mon Sep 17 00:00:00 2001 From: Kyrela Date: Mon, 24 Jun 2024 16:27:28 +0200 Subject: [PATCH 6/6] Match one-letter domains --- discord_markdown_ast_parser/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discord_markdown_ast_parser/lexer.py b/discord_markdown_ast_parser/lexer.py index b566798..fbfbc32 100644 --- a/discord_markdown_ast_parser/lexer.py +++ b/discord_markdown_ast_parser/lexer.py @@ -16,7 +16,7 @@ def __repr__(self): return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})" -URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)" +URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)" class LexingRule(Lexing, Enum):