Skip to content

Commit 44016f3

Browse files
Add support for utf-8 chars
Closes #24 Example command: python src/zkregex_fuzzer/cli.py fuzz \ --oracle combined \ --target noir \ --invalid-input-generator mixed --valid-input-generator mixed \ --fuzzer grammar --grammar-custom-grammar controlled_utf8 \ --process-num 9 --save INVALID_SEED COMPILE_ERROR RUN_ERROR FAILED SUBSTR_MISMATCH --inputs-num 10 --regex-num 10
1 parent 2196f60 commit 44016f3

File tree

10 files changed

+234
-85
lines changed

10 files changed

+234
-85
lines changed

src/zkregex_fuzzer/chars.py

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import string
2+
from dataclasses import dataclass
23

34

45
def create_range(start_char: str, end_char: str) -> set[str]:
@@ -8,13 +9,99 @@ def create_range(start_char: str, end_char: str) -> set[str]:
89
return {chr(i) for i in range(ord(start_char), ord(end_char) + 1)}
910

1011

12+
# TODO singleton
1113
LATIN_EXT_CHARS = create_range("¡", "ƿ")
1214
GREEK_CHARS = create_range("Ͱ", "Ͽ")
1315
CYRILLIC_CHARS = create_range("Ѐ", "ӿ")
14-
ASCII_CHARS = set(string.printable)
15-
ALL_CHARS = ASCII_CHARS.union(LATIN_EXT_CHARS).union(GREEK_CHARS).union(CYRILLIC_CHARS)
16-
SUPPORTED_CHARS = ASCII_CHARS
16+
ASCII = set(string.printable)
17+
CONTROLLED_UTF8_CHARS = (
18+
ASCII.union(LATIN_EXT_CHARS).union(GREEK_CHARS).union(CYRILLIC_CHARS)
19+
)
20+
UNCONTROLLED_UTF8_CHARS = {
21+
chr(codepoint)
22+
for codepoint in range(0x110000)
23+
if not (0xD800 <= codepoint <= 0xDFFF)
24+
}
25+
1726
# Regex metacharacters that must be backslash-escaped to be matched literally.
# Every entry is a single character on purpose: the character sets these are
# subtracted from contain single characters only, so a two-character entry
# such as "()" would never be removed by set.difference() and would produce a
# bogus escape sequence like "\()".
ESCAPE_CHARS = ["\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "]", "{", "}"]
# Backslash-prefixed form of each metacharacter above, in the same order.
ESCAPED_CHARS = [f"\\{c}" for c in ESCAPE_CHARS]
20-
SUPPORTED_ESCAPE_CHARS = ASCII_CHARS.difference(ESCAPE_CHARS).union(ESCAPED_CHARS)
29+
30+
31+
@dataclass
class SupportedChars:
    """Group the different views of one character set.

    Instances are plain containers; the three sets are supplied by the
    caller and stored as given.
    """

    # Every character that belongs to the character set.
    all_chars: set[str]
    # The characters that can be used without escaping.
    non_escaped_chars: set[str]
    # The non-escaped characters together with escaped sequences.
    including_escaped_chars: set[str]
36+
37+
38+
def _build_supported_chars(chars):
    """Derive the three views of a raw character set.

    Args:
        chars: Set containing every character of the character set.

    Returns:
        A ``SupportedChars`` whose ``all_chars`` is ``chars``, whose
        ``non_escaped_chars`` is ``chars`` without the entries of
        ``ESCAPE_CHARS``, and whose ``including_escaped_chars`` is the
        non-escaped set extended with the entries of ``ESCAPED_CHARS``.
    """
    # Compute the difference once and reuse it; the uncontrolled set covers
    # almost the whole Unicode range, so doing it twice is wasted work.
    non_escaped = chars.difference(ESCAPE_CHARS)
    return SupportedChars(
        all_chars=chars,
        non_escaped_chars=non_escaped,
        including_escaped_chars=non_escaped.union(ESCAPED_CHARS),
    )


ASCII_CHARS = _build_supported_chars(ASCII)

# NOTE: the two names below are rebound here from the raw character set to
# the SupportedChars wrapper built from it.
CONTROLLED_UTF8_CHARS = _build_supported_chars(CONTROLLED_UTF8_CHARS)

UNCONTROLLED_UTF8_CHARS = _build_supported_chars(UNCONTROLLED_UTF8_CHARS)
59+
60+
61+
class SupportedCharsManager:
    """Singleton for supported characters.

    The first construction selects the character set. Later constructions
    return the same instance and ignore their ``char_set`` argument; use
    ``override`` to change the character set of the existing instance.
    """

    _instance = None

    def __new__(cls, char_set="ascii"):
        """Return the singleton, creating it with ``char_set`` if needed.

        Args:
            char_set: Name of the character set ("ascii", "controlled_utf8"
                or "uncontrolled_utf8"). Only used when no instance exists.

        Returns:
            The singleton instance.

        Raises:
            ValueError: If an instance has to be created and ``char_set`` is
                not a known name.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.chars = None  # Initialize the attribute
            # Validate before publishing the instance: if the name is
            # invalid, no half-initialised singleton (chars is None) must be
            # left behind for later callers.
            instance._set_chars(char_set)
            cls._instance = instance

        return cls._instance

    def _set_chars(self, char_set):
        """Set the character set based on the provided name.

        Raises:
            ValueError: If ``char_set`` is not a known name; ``self.chars``
                is left unchanged in that case.
        """
        if char_set == "ascii":
            self.chars = ASCII_CHARS
        elif char_set == "controlled_utf8":
            self.chars = CONTROLLED_UTF8_CHARS
        elif char_set == "uncontrolled_utf8":
            self.chars = UNCONTROLLED_UTF8_CHARS
        else:
            raise ValueError(f"Invalid character set: {char_set}")

    def get_chars(self):
        """Get the supported characters."""
        return self.chars

    @classmethod
    def override(cls, char_set):
        """
        Override the character set of the singleton instance.
        If the instance doesn't exist, it will be created.

        Args:
            char_set: The name of the character set to use

        Returns:
            The singleton instance

        Raises:
            ValueError: If ``char_set`` is not a known name.
        """
        # Create the instance if it doesn't exist
        if cls._instance is None:
            return cls(char_set)

        # Override the existing instance's character set
        cls._instance._set_chars(char_set)
        return cls._instance

src/zkregex_fuzzer/cli.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import uuid
99
from pathlib import Path
1010

11+
from zkregex_fuzzer.chars import SupportedCharsManager
1112
from zkregex_fuzzer.configs import (
1213
DEFAULT_HARNESS_TIMEOUT,
1314
DEFAULT_INPUT_GEN_TIMEOUT,
@@ -178,6 +179,12 @@ def fuzz_parser():
178179
default=DEFAULT_HARNESS_TIMEOUT,
179180
help="Timeout for harness execution (default: 300).",
180181
)
182+
parser.add_argument(
183+
"--char-set",
184+
choices=["ascii", "controlled_utf8", "uncontrolled_utf8"],
185+
default="ascii",
186+
help="The character set to use for the fuzzer (default: ascii).",
187+
)
181188
return parser
182189

183190

@@ -283,6 +290,7 @@ def do_fuzz(args):
283290
logging_file=logging_file,
284291
output_path=args.save_output,
285292
save_options=args.save,
293+
char_set=args.char_set,
286294
)
287295

288296
# Use the new reporting function to print configuration
@@ -384,6 +392,15 @@ def main():
384392

385393
logger.setLevel(args.logger_level)
386394

395+
# set supported chars singleton
396+
if args.grammar_custom_grammar == "basic":
397+
args.char_set = "ascii"
398+
elif args.grammar_custom_grammar == "controlled_utf8":
399+
args.char_set = "controlled_utf8"
400+
elif args.grammar_custom_grammar == "uncontrolled_utf8":
401+
args.char_set = "uncontrolled_utf8"
402+
SupportedCharsManager(args.char_set)
403+
387404
if args.subcommand == "fuzz":
388405
do_fuzz(args)
389406
elif args.subcommand == "reproduce":

src/zkregex_fuzzer/configs.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
from zkregex_fuzzer.grammar import BASIC_REGEX_GRAMMAR, OLD_GRAMMAR
1+
from zkregex_fuzzer.grammar import (
2+
BASIC_REGEX_GRAMMAR,
3+
CONTROLLED_UTF8_GRAMMAR,
4+
OLD_GRAMMAR,
5+
UNCONTROLLED_UTF8_GRAMMAR,
6+
)
27
from zkregex_fuzzer.invinpgen import (
38
ComplementBasedGenerator,
49
MutationBasedGenerator,
@@ -36,6 +41,8 @@
3641
GRAMMARS = {
3742
"basic": BASIC_REGEX_GRAMMAR,
3843
"old": OLD_GRAMMAR,
44+
"controlled_utf8": CONTROLLED_UTF8_GRAMMAR,
45+
"uncontrolled_utf8": UNCONTROLLED_UTF8_GRAMMAR,
3946
}
4047

4148
VALID_INPUT_GENERATORS = {

src/zkregex_fuzzer/dfa.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,23 +13,14 @@
1313
from automata.fa.gnfa import GNFA
1414
from automata.fa.nfa import NFA
1515

16-
from zkregex_fuzzer.chars import SUPPORTED_CHARS
17-
18-
19-
def get_supported_symbols() -> set[str]:
20-
"""
21-
Get the set of symbols that are supported by the regex engine.
22-
"""
23-
# TODO make this configurable
24-
# Symbols should include at least all ASCII characters
25-
return SUPPORTED_CHARS
16+
from zkregex_fuzzer.chars import SupportedCharsManager
2617

2718

2819
def regex_to_nfa(regex: str) -> NFA:
2920
"""
3021
Convert a regex to an NFA.
3122
"""
32-
symbols = get_supported_symbols()
23+
symbols = SupportedCharsManager().get_chars().all_chars
3324
regex = unwrap_regex(regex)
3425

3526
try:
@@ -370,7 +361,9 @@ def dfa_string_matching(
370361
# TODO make this configurable
371362
max_length = 500
372363
# Convert regex to NFA
373-
nfa = NFA.from_regex(regex, input_symbols=get_supported_symbols())
364+
nfa = NFA.from_regex(
365+
regex, input_symbols=SupportedCharsManager().get_chars().all_chars
366+
)
374367

375368
# Start with the initial state and an empty string
376369
current_states = nfa._get_lambda_closures()[nfa.initial_state]

src/zkregex_fuzzer/fuzzer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ def fuzz_with_grammar(
5252

5353
if target_grammar == "basic":
5454
grammar = GRAMMARS[target_grammar]
55+
elif target_grammar == "controlled_utf8":
56+
grammar = GRAMMARS[target_grammar]
57+
elif target_grammar == "uncontrolled_utf8":
58+
grammar = GRAMMARS[target_grammar]
5559
elif target_grammar.endswith(".py"):
5660
try:
5761
# Get absolute path
@@ -277,7 +281,7 @@ def _process_results(regex, result):
277281
pbar.update(1)
278282
except concurrent.futures.TimeoutError:
279283
logger.error(
280-
f"Timeout after {timeout_per_regex*len(params)}s processing regexes"
284+
f"Timeout after {timeout_per_regex * len(params)}s processing regexes"
281285
)
282286
# Cancel any remaining futures before exiting the context
283287
for future in futures_to_regex:

src/zkregex_fuzzer/grammar.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,14 @@
2020
- Add more grammars.
2121
"""
2222

23+
import copy
2324
import string
2425
from typing import List
2526

2627
from fuzzingbook.Grammars import Expansion, Grammar
2728

29+
from zkregex_fuzzer.chars import CONTROLLED_UTF8_CHARS, UNCONTROLLED_UTF8_CHARS
30+
2831

2932
def srange(characters: str) -> List[Expansion]:
3033
"""Return a list of single-character expansions from the given string."""
@@ -136,6 +139,28 @@ def crange(start: str, end: str) -> List[Expansion]:
136139
"<ESCAPED>": [f"\\{c}" for c in "\\^$.|?*+()[]{}`-&"],
137140
}
138141

142+
# Both UTF-8 grammars are deep copies of the basic grammar (so the basic one
# is never mutated) with one extra <CHAR> alternative, <UTF8_CHAR>.
CONTROLLED_UTF8_GRAMMAR: Grammar = copy.deepcopy(BASIC_REGEX_GRAMMAR)
CONTROLLED_UTF8_GRAMMAR["<CHAR>"] = [
    "<LETTER>",
    "<DIGIT>",
    "<SYMBOL>",
    "<ESCAPED>",
    "<UTF8_CHAR>",
]
# sorted() instead of list(): the iteration order of a set of strings can
# differ between interpreter runs (string hash randomization), which would
# make the order of the expansions, and so seeded generation, unstable.
CONTROLLED_UTF8_GRAMMAR["<UTF8_CHAR>"] = sorted(
    CONTROLLED_UTF8_CHARS.non_escaped_chars
)

UNCONTROLLED_UTF8_GRAMMAR: Grammar = copy.deepcopy(BASIC_REGEX_GRAMMAR)
UNCONTROLLED_UTF8_GRAMMAR["<CHAR>"] = [
    "<LETTER>",
    "<DIGIT>",
    "<SYMBOL>",
    "<ESCAPED>",
    "<UTF8_CHAR>",
]
# Same ordering concern as above; sorted() keeps the expansion list stable.
UNCONTROLLED_UTF8_GRAMMAR["<UTF8_CHAR>"] = sorted(
    UNCONTROLLED_UTF8_CHARS.non_escaped_chars
)
163+
139164
OLD_GRAMMAR: Grammar = {
140165
# Entry point
141166
"<start>": ["<REGEX>"],

src/zkregex_fuzzer/invinpgen.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import exrex
1313

14-
from zkregex_fuzzer.chars import SUPPORTED_CHARS
14+
from zkregex_fuzzer.chars import SupportedCharsManager
1515
from zkregex_fuzzer.dfa import regex_to_nfa
1616
from zkregex_fuzzer.logger import logger
1717
from zkregex_fuzzer.utils import check_if_string_is_valid, extract_parts, pretty_regex
@@ -170,7 +170,11 @@ def _mutate_input(self, valid_input: str) -> str:
170170
if should_mutate:
171171
# Note that we can still mutate to a valid character
172172
invalid_input[i] = random.choice(
173-
list(SUPPORTED_CHARS.difference({invalid_input[i]}))
173+
list(
174+
SupportedCharsManager()
175+
.get_chars()
176+
.including_escaped_chars.difference({invalid_input[i]})
177+
)
174178
)
175179
if (
176180
not check_if_string_is_valid(self.regex, "".join(invalid_input))

src/zkregex_fuzzer/report.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class Configuration:
3030
logging_file: str | None
3131
output_path: str
3232
save_options: list[str]
33+
char_set: str
3334

3435

3536
class Stats:
@@ -181,6 +182,7 @@ def get_fuzzing_configuration_string(configuration: Configuration):
181182
Logging file: {configuration.logging_file}
182183
Output path: {configuration.output_path}
183184
Save options: {configuration.save_options}
185+
Char set: {configuration.char_set}
184186
"""
185187

186188

@@ -251,6 +253,7 @@ def print_fuzzing_configuration(configuration: Configuration):
251253
f"📥 Inputs num: {configuration.inputs_num}",
252254
f"🔍 Max non-terminals: {configuration.grammar_max_non_terminals}",
253255
f"🔍 Custom grammar: {configuration.grammar_custom_grammar}",
256+
f"🔍 Char set: {configuration.char_set}",
254257
f"🌱 Seed: {configuration.seed}",
255258
f"🔄 Num process: {configuration.num_process}",
256259
f"🔍 Logging file: {os.path.relpath(configuration.logging_file, os.getcwd()) if configuration.logging_file else 'None'}",

0 commit comments

Comments
 (0)