11# util.py
22import contextlib
3+ import re
34from functools import lru_cache , wraps
45import inspect
56import itertools
@@ -303,7 +304,11 @@ def _flatten(ll: Iterable) -> list:
303304
304305
305306def make_compressed_re (
306- word_list : Iterable [str ], max_level : int = 2 , _level : int = 1
307+ word_list : Iterable [str ],
308+ max_level : int = 2 ,
309+ * ,
310+ non_capturing_groups : bool = True ,
311+ _level : int = 1 ,
307312) -> str :
308313 """
309314 Create a regular expression string from a list of words, collapsing by common
@@ -320,37 +325,72 @@ def get_suffixes_from_common_prefixes(namelist: list[str]):
320325 else :
321326 yield namelist [0 ][0 ], [namelist [0 ][1 :]]
322327
328+ if _level == 1 :
329+ if not word_list :
330+ raise ValueError ("no words given to make_compressed_re()" )
331+
332+ if "" in word_list :
333+ raise ValueError ("word list cannot contain empty string" )
334+ else :
335+ # internal recursive call, just return empty string if no words
336+ if not word_list :
337+ return ""
338+
339+ # dedupe the word list
340+ word_list = list ({}.fromkeys (word_list ))
341+
323342 if max_level == 0 :
324- return "|" .join (sorted (word_list , key = len , reverse = True ))
343+ if any (len (wd ) > 1 for wd in word_list ):
344+ return "|" .join (
345+ sorted ([re .escape (wd ) for wd in word_list ], key = len , reverse = True )
346+ )
347+ else :
348+ return f"[{ '' .join (_escape_regex_range_chars (wd ) for wd in word_list )} ]"
325349
326350 ret = []
327351 sep = ""
352+ ncgroup = "?:" if non_capturing_groups else ""
353+
328354 for initial , suffixes in get_suffixes_from_common_prefixes (sorted (word_list )):
329355 ret .append (sep )
330356 sep = "|"
331357
358+ initial = re .escape (initial )
359+
332360 trailing = ""
333361 if "" in suffixes :
334362 trailing = "?"
335363 suffixes .remove ("" )
336364
337365 if len (suffixes ) > 1 :
338366 if all (len (s ) == 1 for s in suffixes ):
339- ret .append (f"{ initial } [{ '' .join (suffixes )} ]{ trailing } " )
367+ ret .append (
368+ f"{ initial } [{ '' .join (_escape_regex_range_chars (s ) for s in suffixes )} ]{ trailing } "
369+ )
340370 else :
341371 if _level < max_level :
342372 suffix_re = make_compressed_re (
343- sorted (suffixes ), max_level , _level + 1
373+ sorted (suffixes ),
374+ max_level ,
375+ non_capturing_groups = non_capturing_groups ,
376+ _level = _level + 1 ,
344377 )
345- ret .append (f"{ initial } ({ suffix_re } ){ trailing } " )
378+ ret .append (f"{ initial } ({ ncgroup } { suffix_re } ){ trailing } " )
346379 else :
347- suffixes .sort (key = len , reverse = True )
348- ret .append (f"{ initial } ({ '|' .join (suffixes )} ){ trailing } " )
380+ if all (len (s ) == 1 for s in suffixes ):
381+ ret .append (
382+ f"{ initial } [{ '' .join (_escape_regex_range_chars (s ) for s in suffixes )} ]{ trailing } "
383+ )
384+ else :
385+ suffixes .sort (key = len , reverse = True )
386+ ret .append (
387+ f"{ initial } ({ ncgroup } { '|' .join (re .escape (s ) for s in suffixes )} ){ trailing } "
388+ )
349389 else :
350390 if suffixes :
351- suffix = suffixes [0 ]
391+ suffix = re . escape ( suffixes [0 ])
352392 if len (suffix ) > 1 and trailing :
353- ret .append (f"{ initial } ({ suffix } ){ trailing } " )
393+ ret .append (f"{ initial } ({ ncgroup } { suffix } ){ trailing } " )
354394 else :
355395 ret .append (f"{ initial } { suffix } { trailing } " )
356396 else :
0 commit comments