Skip to content

Commit 78cb377

Browse files
picnixz and barneygale authored
gh-122288: Improve performance of fnmatch.translate (#122289)
Improve performance of this function by a factor of 1.7.

Co-authored-by: Barney Gale <[email protected]>
1 parent 14a05a8 commit 78cb377

File tree

4 files changed: +114 −49 lines changed

Lib/fnmatch.py

Lines changed: 42 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -77,24 +77,30 @@ def translate(pat):
7777
There is no way to quote meta-characters.
7878
"""
7979

80-
STAR = object()
81-
parts = _translate(pat, STAR, '.')
82-
return _join_translated_parts(parts, STAR)
80+
parts, star_indices = _translate(pat, '*', '.')
81+
return _join_translated_parts(parts, star_indices)
8382

83+
_re_setops_sub = re.compile(r'([&~|])').sub
84+
_re_escape = functools.lru_cache(maxsize=512)(re.escape)
8485

85-
def _translate(pat, STAR, QUESTION_MARK):
86+
def _translate(pat, star, question_mark):
8687
res = []
8788
add = res.append
89+
star_indices = []
90+
8891
i, n = 0, len(pat)
8992
while i < n:
9093
c = pat[i]
9194
i = i+1
9295
if c == '*':
96+
# store the position of the wildcard
97+
star_indices.append(len(res))
98+
add(star)
9399
# compress consecutive `*` into one
94-
if (not res) or res[-1] is not STAR:
95-
add(STAR)
100+
while i < n and pat[i] == '*':
101+
i += 1
96102
elif c == '?':
97-
add(QUESTION_MARK)
103+
add(question_mark)
98104
elif c == '[':
99105
j = i
100106
if j < n and pat[j] == '!':
@@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
133139
# Hyphens that create ranges shouldn't be escaped.
134140
stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
135141
for s in chunks)
136-
# Escape set operations (&&, ~~ and ||).
137-
stuff = re.sub(r'([&~|])', r'\\\1', stuff)
138142
i = j+1
139143
if not stuff:
140144
# Empty range: never match.
@@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
143147
# Negated empty range: match any character.
144148
add('.')
145149
else:
150+
# Escape set operations (&&, ~~ and ||).
151+
stuff = _re_setops_sub(r'\\\1', stuff)
146152
if stuff[0] == '!':
147153
stuff = '^' + stuff[1:]
148154
elif stuff[0] in ('^', '['):
149155
stuff = '\\' + stuff
150156
add(f'[{stuff}]')
151157
else:
152-
add(re.escape(c))
153-
assert i == n
154-
return res
155-
156-
157-
def _join_translated_parts(inp, STAR):
158-
# Deal with STARs.
159-
res = []
160-
add = res.append
161-
i, n = 0, len(inp)
162-
# Fixed pieces at the start?
163-
while i < n and inp[i] is not STAR:
164-
add(inp[i])
165-
i += 1
166-
# Now deal with STAR fixed STAR fixed ...
167-
# For an interior `STAR fixed` pairing, we want to do a minimal
168-
# .*? match followed by `fixed`, with no possibility of backtracking.
169-
# Atomic groups ("(?>...)") allow us to spell that directly.
170-
# Note: people rely on the undocumented ability to join multiple
171-
# translate() results together via "|" to build large regexps matching
172-
# "one of many" shell patterns.
173-
while i < n:
174-
assert inp[i] is STAR
175-
i += 1
176-
if i == n:
177-
add(".*")
178-
break
179-
assert inp[i] is not STAR
180-
fixed = []
181-
while i < n and inp[i] is not STAR:
182-
fixed.append(inp[i])
183-
i += 1
184-
fixed = "".join(fixed)
185-
if i == n:
186-
add(".*")
187-
add(fixed)
188-
else:
189-
add(f"(?>.*?{fixed})")
158+
add(_re_escape(c))
190159
assert i == n
191-
res = "".join(res)
160+
return res, star_indices
161+
162+
163+
def _join_translated_parts(parts, star_indices):
164+
if not star_indices:
165+
return fr'(?s:{"".join(parts)})\Z'
166+
iter_star_indices = iter(star_indices)
167+
j = next(iter_star_indices)
168+
buffer = parts[:j] # fixed pieces at the start
169+
append, extend = buffer.append, buffer.extend
170+
i = j + 1
171+
for j in iter_star_indices:
172+
# Now deal with STAR fixed STAR fixed ...
173+
# For an interior `STAR fixed` pairing, we want to do a minimal
174+
# .*? match followed by `fixed`, with no possibility of backtracking.
175+
# Atomic groups ("(?>...)") allow us to spell that directly.
176+
# Note: people rely on the undocumented ability to join multiple
177+
# translate() results together via "|" to build large regexps matching
178+
# "one of many" shell patterns.
179+
append('(?>.*?')
180+
extend(parts[i:j])
181+
append(')')
182+
i = j + 1
183+
append('.*')
184+
extend(parts[i:])
185+
res = ''.join(buffer)
192186
return fr'(?s:{res})\Z'

Lib/glob.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
312312
if part:
313313
if not include_hidden and part[0] in '*?':
314314
results.append(r'(?!\.)')
315-
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
315+
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])
316316
if idx < last_part_idx:
317317
results.append(any_sep)
318318
res = ''.join(results)

Lib/test/test_fnmatch.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,75 @@ def test_translate(self):
250250
self.assertTrue(re.match(fatre, 'cbabcaxc'))
251251
self.assertFalse(re.match(fatre, 'dabccbad'))
252252

253+
def test_translate_wildcards(self):
254+
for pattern, expect in [
255+
('ab*', r'(?s:ab.*)\Z'),
256+
('ab*cd', r'(?s:ab.*cd)\Z'),
257+
('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
258+
('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
259+
('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
260+
('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
261+
('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
262+
]:
263+
with self.subTest(pattern):
264+
translated = translate(pattern)
265+
self.assertEqual(translated, expect, pattern)
266+
267+
for pattern, expect in [
268+
('*ab', r'(?s:.*ab)\Z'),
269+
('*ab*', r'(?s:(?>.*?ab).*)\Z'),
270+
('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
271+
('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
272+
('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
273+
('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
274+
('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
275+
('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
276+
]:
277+
with self.subTest(pattern):
278+
translated = translate(pattern)
279+
self.assertEqual(translated, expect, pattern)
280+
281+
def test_translate_expressions(self):
282+
for pattern, expect in [
283+
('[', r'(?s:\[)\Z'),
284+
('[!', r'(?s:\[!)\Z'),
285+
('[]', r'(?s:\[\])\Z'),
286+
('[abc', r'(?s:\[abc)\Z'),
287+
('[!abc', r'(?s:\[!abc)\Z'),
288+
('[abc]', r'(?s:[abc])\Z'),
289+
('[!abc]', r'(?s:[^abc])\Z'),
290+
('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
291+
# with [[
292+
('[[', r'(?s:\[\[)\Z'),
293+
('[[a', r'(?s:\[\[a)\Z'),
294+
('[[]', r'(?s:[\[])\Z'),
295+
('[[]a', r'(?s:[\[]a)\Z'),
296+
('[[]]', r'(?s:[\[]\])\Z'),
297+
('[[]a]', r'(?s:[\[]a\])\Z'),
298+
('[[a]', r'(?s:[\[a])\Z'),
299+
('[[a]]', r'(?s:[\[a]\])\Z'),
300+
('[[a]b', r'(?s:[\[a]b)\Z'),
301+
# backslashes
302+
('[\\', r'(?s:\[\\)\Z'),
303+
(r'[\]', r'(?s:[\\])\Z'),
304+
(r'[\\]', r'(?s:[\\\\])\Z'),
305+
]:
306+
with self.subTest(pattern):
307+
translated = translate(pattern)
308+
self.assertEqual(translated, expect, pattern)
309+
310+
def test_star_indices_locations(self):
311+
from fnmatch import _translate
312+
313+
blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
314+
parts, star_indices = _translate(''.join(blocks), '*', '.')
315+
expect_parts = ['a', r'\^', 'b', '*',
316+
'.', '.', '[a-z]', '[1-9]', '*',
317+
r'\+', r'\+', r'\[', r'\[', 'a']
318+
self.assertListEqual(parts, expect_parts)
319+
self.assertListEqual(star_indices, [3, 8])
320+
321+
253322
class FilterTestCase(unittest.TestCase):
254323

255324
def test_filter(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Improve the performance of :func:`fnmatch.translate` by a factor of 1.7. Patch
2+
by Bénédikt Tran.

0 commit comments

Comments
 (0)