Skip to content

Commit bb45468

Browse files
committed
Much faster implementation of FileList, for big egg_info speedups
1 parent 7edbffc commit bb45468

File tree

3 files changed

+451
-44
lines changed

3 files changed

+451
-44
lines changed

setuptools/command/egg_info.py

Lines changed: 232 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Create a distribution's .egg-info directory and contents"""
44

55
from distutils.filelist import FileList as _FileList
6+
from distutils.errors import DistutilsInternalError
67
from distutils.util import convert_path
78
from distutils import log
89
import distutils.errors
@@ -27,6 +28,7 @@
2728
parse_requirements, safe_name, parse_version,
2829
safe_version, yield_lines, EntryPoint, iter_entry_points, to_filename)
2930
import setuptools.unicode_utils as unicode_utils
31+
from setuptools.glob import glob
3032

3133
from pkg_resources.extern import packaging
3234

@@ -36,6 +38,88 @@
3638
pass
3739

3840

41+
def translate_pattern(glob):
    """
    Translate a file path glob like '*.txt' into a regular expression.

    This differs from fnmatch.translate, which allows wildcards to match
    directory separators. It also knows about '**/', which matches any
    number of directories.

    :param glob: the glob to translate; it is split into path components
        on ``os.path.sep``. (The parameter name shadows any module-level
        ``glob`` inside this function only.)
    :returns: a compiled pattern (``re.MULTILINE | re.DOTALL``) anchored at
        the end of the string. Use its ``match`` method so the start is
        anchored as well.
    """
    pat = ''

    # This will split on '/' within [character classes]. This is deliberate.
    chunks = glob.split(os.path.sep)

    sep = re.escape(os.sep)
    valid_char = '[^%s]' % (sep,)
    final_index = len(chunks) - 1

    for c, chunk in enumerate(chunks):
        last_chunk = c == final_index

        # Chunks that are a literal ** are globstars. They match anything.
        if chunk == '**':
            if last_chunk:
                # Match anything if this is the last component
                pat += '.*'
            else:
                # Match '(name/)*'
                pat += '(?:%s+%s)*' % (valid_char, sep)
            # The whole path component has been handled; no separator is
            # appended because the group above already consumes it.
            continue

        # Find any special characters in the remainder
        i = 0
        chunk_len = len(chunk)
        while i < chunk_len:
            char = chunk[i]
            if char == '*':
                # Match any number of name characters
                pat += valid_char + '*'
            elif char == '?':
                # Match a name character
                pat += valid_char
            elif char == '[':
                # Character class
                inner_i = i + 1
                # Skip initial !/] chars
                if inner_i < chunk_len and chunk[inner_i] == '!':
                    inner_i = inner_i + 1
                if inner_i < chunk_len and chunk[inner_i] == ']':
                    inner_i = inner_i + 1

                # Loop till the closing ] is found
                while inner_i < chunk_len and chunk[inner_i] != ']':
                    inner_i = inner_i + 1

                if inner_i >= chunk_len:
                    # Got to the end of the string without finding a closing ]
                    # Do not treat this as a matching group, but as a literal [
                    pat += re.escape(char)
                else:
                    # Grab the insides of the [brackets]; the scan above
                    # guarantees at least one character in between.
                    inner = chunk[i + 1:inner_i]
                    char_class = ''

                    # Class negation
                    if inner[0] == '!':
                        char_class = '^'
                        inner = inner[1:]

                    # NOTE(review): re.escape() also escapes '-', so a range
                    # such as [a-z] only matches the literal characters
                    # 'a', '-' and 'z'. Left unchanged to preserve existing
                    # behaviour -- confirm whether ranges should be honoured.
                    char_class += re.escape(inner)
                    pat += '[%s]' % (char_class,)

                    # Skip to the end ]
                    i = inner_i
            else:
                pat += re.escape(char)
            i += 1

        # Join each chunk with the dir separator
        if not last_chunk:
            pat += sep

    # Pass the flags as an argument instead of a trailing '(?ms)': inline
    # global flags that are not at the start of the expression are
    # deprecated since Python 3.6 and rejected with re.error on 3.11+.
    return re.compile(pat + r'\Z', flags=re.MULTILINE | re.DOTALL)
121+
122+
39123
class egg_info(Command):
40124
description = "create a distribution's .egg-info directory"
41125

@@ -239,7 +323,151 @@ def check_broken_egg_info(self):
239323

240324

241325
class FileList(_FileList):
242-
"""File list that accepts only existing, platform-independent paths"""
326+
# Implementations of the various MANIFEST.in commands
327+
328+
def process_template_line(self, line):
    """
    Execute a single MANIFEST.in template line against this file list.

    The line is handed to ``self._parse_template_line``, which yields the
    action word plus whichever of (patterns, dir, dir_pattern) that action
    uses. The matching method on this object is then called once per
    pattern (or once for graft/prune), and a warning is logged whenever
    that call reports that nothing matched.

    Raises DistutilsInternalError if the parsed action is not one of the
    eight known commands.
    """
    action, patterns, dir, dir_pattern = self._parse_template_line(line)

    # Each table maps an action word to (method name, warning format used
    # when the method returns a false value).
    takes_pattern = {
        'include': (
            'include',
            "warning: no files found matching '%s'",
        ),
        'exclude': (
            'exclude',
            ("warning: no previously-included files "
             "found matching '%s'"),
        ),
        'global-include': (
            'global_include',
            ("warning: no files found matching '%s' "
             "anywhere in distribution"),
        ),
        'global-exclude': (
            'global_exclude',
            ("warning: no previously-included files matching "
             "'%s' found anywhere in distribution"),
        ),
    }
    takes_dir_and_pattern = {
        'recursive-include': (
            'recursive_include',
            ("warning: no files found matching '%s' "
             "under directory '%s'"),
        ),
        'recursive-exclude': (
            'recursive_exclude',
            ("warning: no previously-included files matching "
             "'%s' found under directory '%s'"),
        ),
    }
    takes_dir_pattern = {
        'graft': (
            'graft',
            "warning: no directories found matching '%s'",
        ),
        'prune': (
            'prune',
            ("no previously-included directories found "
             "matching '%s'"),
        ),
    }

    if action in takes_pattern:
        method_name, not_found = takes_pattern[action]
        self.debug_print(action + " " + ' '.join(patterns))
        handler = getattr(self, method_name)
        for pattern in patterns:
            if not handler(pattern):
                log.warn(not_found, pattern)
        return

    if action in takes_dir_and_pattern:
        method_name, not_found = takes_dir_and_pattern[action]
        self.debug_print("%s %s %s" % (action, dir, ' '.join(patterns)))
        handler = getattr(self, method_name)
        for pattern in patterns:
            if not handler(dir, pattern):
                log.warn(not_found, pattern, dir)
        return

    if action in takes_dir_pattern:
        method_name, not_found = takes_dir_pattern[action]
        self.debug_print(action + " " + dir_pattern)
        if not getattr(self, method_name)(dir_pattern):
            log.warn(not_found, dir_pattern)
        return

    # The parser is expected to have rejected unknown actions already.
    raise DistutilsInternalError(
        "this cannot happen: invalid action '%s'" % action)
400+
401+
def _remove_files(self, predicate):
    """
    Delete, in place, every entry of ``self.files`` accepted by 'predicate'.

    'predicate' is called with each file name; a true result removes that
    entry. Returns True when at least one entry was removed, else False.
    """
    removed_any = False
    # Visit indices from the tail so a deletion never shifts an entry that
    # has not been examined yet.
    for index in reversed(range(len(self.files))):
        name = self.files[index]
        if not predicate(name):
            continue
        self.debug_print(" removing " + name)
        del self.files[index]
        removed_any = True
    return removed_any
413+
414+
def include(self, pattern):
    """
    Add the non-directory paths that glob 'pattern' expands to.

    Returns True if at least one path was added, otherwise False.
    """
    matches = []
    for path in glob(pattern):
        # Directories returned by the glob are not files to ship.
        if os.path.isdir(path):
            continue
        matches.append(path)
    self.extend(matches)
    return bool(matches)
419+
420+
def exclude(self, pattern):
    """
    Remove already-listed files whose path matches 'pattern'.

    Returns True if any file was removed.
    """
    matcher = translate_pattern(pattern).match
    return self._remove_files(matcher)
424+
425+
def recursive_include(self, dir, pattern):
    """
    Add every non-directory path at any depth under 'dir' matching 'pattern'.

    Returns True if at least one path was added, otherwise False.
    """
    # '**' lets the recursive glob descend through any number of
    # directories below 'dir'.
    search = os.path.join(dir, '**', pattern)
    hits = []
    for candidate in glob(search, recursive=True):
        if not os.path.isdir(candidate):
            hits.append(candidate)
    self.extend(hits)
    return bool(hits)
434+
435+
def recursive_exclude(self, dir, pattern):
    """
    Remove listed files at any depth under 'dir' that match 'pattern'.

    Returns True if any file was removed.
    """
    anywhere_below = os.path.join(dir, '**', pattern)
    return self._remove_files(translate_pattern(anywhere_below).match)
441+
442+
def graft(self, dir):
    """
    Add everything ``distutils.filelist.findall`` reports for 'dir'.

    Returns True if that search produced anything, otherwise False.
    """
    discovered = distutils.filelist.findall(dir)
    self.extend(discovered)
    if discovered:
        return True
    return False
447+
448+
def prune(self, dir):
    """
    Remove every listed file located under 'dir'.

    Returns True if any file was removed.
    """
    # A trailing '**' translates to "match anything below this directory".
    subtree = os.path.join(dir, '**')
    compiled = translate_pattern(subtree)
    return self._remove_files(compiled.match)
452+
453+
def global_include(self, pattern):
    """
    Add every entry of ``self.allfiles`` whose path matches 'pattern' at
    any directory depth. This is very inefficient on large file trees.

    ``self.findall()`` is called first when ``self.allfiles`` is still
    None. Returns True if at least one file was added.
    """
    if self.allfiles is None:
        self.findall()
    is_match = translate_pattern(os.path.join('**', pattern)).match
    selected = list(filter(is_match, self.allfiles))
    self.extend(selected)
    return bool(selected)
464+
465+
def global_exclude(self, pattern):
    """
    Remove listed files matching 'pattern' at any directory depth.

    Returns True if any file was removed.
    """
    everywhere = os.path.join('**', pattern)
    return self._remove_files(translate_pattern(everywhere).match)
243471

244472
def append(self, item):
245473
if item.endswith('\r'): # Fix older sdists built on Windows
@@ -302,7 +530,6 @@ def run(self):
302530
self.filelist = FileList()
303531
if not os.path.exists(self.manifest):
304532
self.write_manifest() # it must exist so it'll get in the list
305-
self.filelist.findall()
306533
self.add_defaults()
307534
if os.path.exists(self.template):
308535
self.read_template()
@@ -341,38 +568,13 @@ def add_defaults(self):
341568
elif os.path.exists(self.manifest):
342569
self.read_manifest()
343570
ei_cmd = self.get_finalized_command('egg_info')
344-
self._add_egg_info(cmd=ei_cmd)
345-
self.filelist.include_pattern("*", prefix=ei_cmd.egg_info)
346-
347-
def _add_egg_info(self, cmd):
    """
    Add paths for egg-info files for an external egg-base.

    The egg-info files are written to egg-base. If egg-base is outside
    the current working directory, this method searches the egg-base
    directory for files to include in the manifest, using
    distutils.filelist.findall (which is really the version
    monkeypatched in by setuptools/__init__.py).

    Since findall records relative paths, each returned path is
    prefixed with cmd.egg_base so that add_defaults' include_pattern
    call (which looks for the absolute cmd.egg_info) will match it.
    """
    if cmd.egg_base != os.curdir:
        base = cmd.egg_base
        found = distutils.filelist.findall(base)
        # Lazily join each discovered path onto the egg-base while
        # appending to the list of all known files.
        self.filelist.allfiles.extend(
            os.path.join(base, entry) for entry in found)
    # Otherwise the egg-info files were already added by something else.
571+
self.filelist.graft(ei_cmd.egg_info)
370572

371573
def prune_file_list(self):
372574
build = self.get_finalized_command('build')
373575
base_dir = self.distribution.get_fullname()
374-
self.filelist.exclude_pattern(None, prefix=build.build_base)
375-
self.filelist.exclude_pattern(None, prefix=base_dir)
576+
self.filelist.prune(build.build_base)
577+
self.filelist.prune(base_dir)
376578
sep = re.escape(os.sep)
377579
self.filelist.exclude_pattern(r'(^|' + sep + r')(RCS|CVS|\.svn)' + sep,
378580
is_regex=1)

0 commit comments

Comments
 (0)