11import string
2+ from dataclasses import dataclass
23
34
def create_range(start_char: str, end_char: str) -> set[str]:
    """Return every character whose code point lies in an inclusive range.

    Args:
        start_char: Single character marking the lowest code point.
        end_char: Single character marking the highest code point (included).

    Returns:
        The set of characters from ``start_char`` through ``end_char``. The
        set is empty when ``start_char`` has a higher code point than
        ``end_char``.
    """
    first, last = ord(start_char), ord(end_char)
    # ``range`` excludes its stop value, so add one to keep ``end_char``.
    return {chr(codepoint) for codepoint in range(first, last + 1)}
910
1011
# TODO singleton
# Extra alphabets, each built from an inclusive code point range:
# U+00A1..U+01BF, U+0370..U+03FF and U+0400..U+04FF respectively.
LATIN_EXT_CHARS = create_range("¡", "ƿ")
GREEK_CHARS = create_range("Ͱ", "Ͽ")
CYRILLIC_CHARS = create_range("Ѐ", "ӿ")

# Everything in ``string.printable`` (digits, letters, punctuation, whitespace).
ASCII = set(string.printable)

# Printable ASCII plus the three curated ranges above.
CONTROLLED_UTF8_CHARS = ASCII | LATIN_EXT_CHARS | GREEK_CHARS | CYRILLIC_CHARS

# Every code point below 0x110000 except 0xD800..0xDFFF (the surrogate range).
# NOTE: this materializes roughly 1.1 million one-character strings at import.
UNCONTROLLED_UTF8_CHARS = {
    chr(codepoint)
    for codepoint in range(0x110000)
    if not (0xD800 <= codepoint <= 0xDFFF)
}
25+
# Regex metacharacters that need a backslash to be matched literally.
# Each entry must be a single character: these lists are fed to
# ``set.difference`` / ``set.union`` on sets of one-character strings, so a
# two-character entry such as "()" would never match "(" or ")".
ESCAPE_CHARS = [
    "\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "]", "{", "}",
]
# The same characters, each prefixed with a backslash (e.g. "\\(").
ESCAPED_CHARS = [f"\\{c}" for c in ESCAPE_CHARS]
29+
30+
@dataclass
class SupportedChars:
    """Group the variants of one alphabet that are used together.

    The module-level instances populate the fields as follows:

    Attributes:
        all_chars: The complete alphabet.
        non_escaped_chars: ``all_chars`` with the entries of ``ESCAPE_CHARS``
            removed.
        including_escaped_chars: ``non_escaped_chars`` plus the
            backslash-prefixed entries of ``ESCAPED_CHARS``.
    """

    all_chars: set[str]
    non_escaped_chars: set[str]
    including_escaped_chars: set[str]
36+
37+
def _build_supported_chars(chars: set[str]) -> SupportedChars:
    """Wrap ``chars`` together with its non-escaped and escaped variants."""
    # Computed once and reused; this matters for the very large alphabet.
    non_escaped = chars.difference(ESCAPE_CHARS)
    return SupportedChars(
        all_chars=chars,
        non_escaped_chars=non_escaped,
        including_escaped_chars=non_escaped.union(ESCAPED_CHARS),
    )


ASCII_CHARS = _build_supported_chars(ASCII)

# NOTE: the next two names are deliberately rebound. Before these lines they
# hold plain ``set[str]`` objects; afterwards they hold ``SupportedChars``
# wrappers whose ``all_chars`` is that original set.
CONTROLLED_UTF8_CHARS = _build_supported_chars(CONTROLLED_UTF8_CHARS)
UNCONTROLLED_UTF8_CHARS = _build_supported_chars(UNCONTROLLED_UTF8_CHARS)
59+
60+
class SupportedCharsManager:
    """Singleton holding the currently selected ``SupportedChars`` bundle.

    Calling the class always returns the same instance. The ``char_set``
    argument is honoured only by the call that creates the instance; use
    :meth:`override` to switch character sets afterwards.
    """

    _instance = None

    def __new__(cls, char_set: str = "ascii"):
        """Return the shared instance, creating it on first use.

        Args:
            char_set: Name of the character set to select. One of
                ``"ascii"``, ``"controlled_utf8"`` or ``"uncontrolled_utf8"``.
                Ignored when the instance already exists.

        Raises:
            ValueError: If the instance has to be created and ``char_set``
                is not a known name.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.chars = None  # Initialize the attribute
            instance._set_chars(char_set)
            # Publish only after the selection succeeded, so an invalid name
            # cannot leave behind a singleton whose ``chars`` is ``None``.
            cls._instance = instance

        return cls._instance

    def _set_chars(self, char_set: str) -> None:
        """Select the character set registered under ``char_set``.

        Raises:
            ValueError: If ``char_set`` is not a known name. The previously
                selected set is left untouched in that case.
        """
        if char_set == "ascii":
            self.chars = ASCII_CHARS
        elif char_set == "controlled_utf8":
            self.chars = CONTROLLED_UTF8_CHARS
        elif char_set == "uncontrolled_utf8":
            self.chars = UNCONTROLLED_UTF8_CHARS
        else:
            raise ValueError(f"Invalid character set: {char_set}")

    def get_chars(self) -> "SupportedChars":
        """Return the currently selected ``SupportedChars`` bundle."""
        return self.chars

    @classmethod
    def override(cls, char_set: str) -> "SupportedCharsManager":
        """Switch the singleton to another character set.

        If the instance doesn't exist yet, it is created with ``char_set``.

        Args:
            char_set: The name of the character set to use.

        Returns:
            The singleton instance.

        Raises:
            ValueError: If ``char_set`` is not a known name.
        """
        # Create the instance if it doesn't exist
        if cls._instance is None:
            return cls(char_set)

        # Override the existing instance's character set
        cls._instance._set_chars(char_set)
        return cls._instance
0 commit comments