discord_markdown_ast_parser/__init__.py (4 additions, 4 deletions)
@@ -1,10 +1,10 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Union

-from lexer import lex, Lexing
-from parser import Node, parse_tokens
+from .lexer import lex, Lexing
+from .parser import Node, parse_tokens


-def lexing_list_convert(lexing: Lexing) -> List[Lexing]:
+def lexing_list_convert(lexing: Union[List[Lexing], Lexing]) -> List[Lexing]:
    if not isinstance(lexing, list):
        lexing = [lexing]
    return [Lexing(item) if isinstance(item, str) else item for item in lexing]
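
The switch to relative imports lets the package be imported as an installed module rather than only from its own directory, and the widened signature accepts either a single rule or a list of rules, promoting plain pattern strings to `Lexing` along the way. A quick sketch of the normalization, assuming the package is importable under its distribution name:

```python
from discord_markdown_ast_parser import lexing_list_convert
from discord_markdown_ast_parser.lexer import Lexing

# A single Lexing, a single pattern string, or a list of either all
# normalize to List[Lexing]; strings are wrapped via Lexing(item).
print(lexing_list_convert(Lexing(r"\d+")))            # [Lexing('\\d+')]
print(lexing_list_convert(r"\d+"))                    # [Lexing('\\d+')]
print(lexing_list_convert([r"\d+", Lexing(r"\w+")]))  # [Lexing('\\d+'), Lexing('\\w+')]
```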
discord_markdown_ast_parser/lexer.py (13 additions, 15 deletions)
@@ -1,5 +1,5 @@
import re
-from dataclasses import dataclass, InitVar, field
+from dataclasses import dataclass, field
from enum import Enum
from typing import Optional, List, Generator, Dict
import itertools
@@ -8,31 +8,30 @@
class Lexing:
    def __init__(self, pattern: Optional[str] = None, flags: re.RegexFlag = re.NOFLAG):
        self.regex = re.compile(pattern, flags=flags) if pattern else None

    def __call__(self, text: str):
        return self.regex and self.regex.match(text)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})"

-# stolen from https://www.urlregex.com/
-URL_REGEX = (
-    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
-)
+URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"


class LexingRule(Lexing, Enum):
    USER_MENTION = r"<@!?(\d{15,20})>"
    ROLE_MENTION = r"<@&(\d{15,20})>"
-    SLASH_COMMAND_MENTION = r"</([a-zA-Z0-9_ ]{2,}):(\d{15,20})>"
+    SLASH_COMMAND_MENTION = r"</([a-zA-Z0-9_ ]{2,}):(\d{15,20})>"
    CHANNEL_MENTION = r"<#(\d{15,20})>"
    TIMESTAMP = r"<t:(-?\d+)(?::([tTdDfFR]))?>"
    EMOJI_CUSTOM = r"<:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
    EMOJI_CUSTOM_ANIMATED = r"<a:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
    EMOJI_UNICODE = r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])"
    EMOJI_UNICODE_ENCODED = r":([a-zA-Z0-9_]+):"
-    URL_WITHOUT_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(<({URL_REGEX})>\)"
-    URL_WITH_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(({URL_REGEX})\)"
-    URL_WITHOUT_PREVIEW = f"<{URL_REGEX}>"
+    URL_WITHOUT_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(<({URL_REGEX})>\)"
+    URL_WITH_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(({URL_REGEX})\)"
+    URL_WITHOUT_PREVIEW = fr"<{URL_REGEX}>"
    URL_WITH_PREVIEW = URL_REGEX
    QUOTE_LINE_PREFIX = r"(>>)?> "
    TILDE = r"~"
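
Two fixes land in this hunk. The replacement `URL_REGEX` is stricter than the urlregex.com pattern it replaces; notably, the old character class `[$-_@.&+]` contains a range spanning every ASCII character from `$` (0x24) to `_` (0x5F), so it matched far more than intended. The embedded-URL patterns also gain the `r` prefix: in a plain f-string, `\[` is an invalid escape sequence (a `DeprecationWarning`, and a `SyntaxWarning` since Python 3.12), while `fr` keeps the backslashes literal for the regex engine. Since `LexingRule` mixes `Lexing` into an `Enum`, every member compiles its pattern and is directly callable, which is how `lex()` drives it below. A small check of the embedded pattern, with an illustrative URL:

```python
from discord_markdown_ast_parser.lexer import LexingRule

# Enum members are Lexing instances, so calling one runs regex.match()
# against the start of the text.
m = LexingRule.URL_WITHOUT_PREVIEW_EMBEDDED("[docs](<https://example.com/page>)")
print(m[1])  # 'docs'                      -> the link label
print(m[2])  # 'https://example.com/page'  -> the URL captured around URL_REGEX
```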
@@ -50,12 +49,11 @@ class Token:
    value: str = ""
    lexing_rule: Lexing = LexingRule.TEXT_INLINE
    groups: List[str] = field(default_factory=list)

    def __contains__(self, rule: Lexing):
        return self.lexing_rule == rule



def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]:
"""Lexes the input text and returns a generator of tokens.
The generator will yield a token for each lexing rule that matches the input text.
@@ -68,7 +66,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]:
"""
seen_simple_text = ""
custom = custom or {}

while input_text:
for rule in itertools.chain(*custom.values(), LexingRule):
match = rule(input_text)
@@ -81,7 +79,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]:
            continue  # don't yield a token in this run

        # cut off matched part
-        input_text = input_text[len(match[0]) :]
+        input_text = input_text[len(match[0]):]

        # yield inline text if we have some left
        if len(seen_simple_text) > 0:
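
With the imports fixed, the lexer is usable from outside the package. `Token.__contains__` turns rule checks into membership tests, and the `custom` mapping is chained ahead of the built-in `LexingRule` members, so caller-supplied rules are tried first. A usage sketch with an invented custom rule; `groups` is assumed to carry the regex capture groups, as the dataclass field suggests:

```python
from discord_markdown_ast_parser.lexer import Lexing, LexingRule, lex

# Custom rules are tried before the built-in LexingRule members.
custom = {"keyword": [Lexing(r"!ping\b")]}
text = "hi <@!123456789012345678> !ping <https://example.com>"

for token in lex(text, custom):
    # Token.__contains__ compares lexing rules, so `rule in token` reads naturally.
    if LexingRule.USER_MENTION in token:
        print("mentioned user id:", token.groups[0])
```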
discord_markdown_ast_parser/parser.py (3 additions, 1 deletion)
@@ -4,7 +4,7 @@
import itertools
from typing import Optional, Generator, Any, List, Dict, Tuple, Iterable

-from lexer import Token, LexingRule, Lexing
+from .lexer import Token, LexingRule, Lexing


NodeType = Enum(
@@ -21,12 +21,14 @@
"CHANNEL",
"SLASH_COMMAND",
"EMOJI_CUSTOM",
"EMOJI_CUSTOM_ANIMATED",
"EMOJI_UNICODE",
"EMOJI_UNICODE_ENCODED",
"URL_WITH_PREVIEW_EMBEDDED",
"URL_WITHOUT_PREVIEW_EMBEDDED",
"URL_WITH_PREVIEW",
"URL_WITHOUT_PREVIEW",
"TIMESTAMP",
"QUOTE_BLOCK",
"CODE_BLOCK",
"CODE_INLINE",