Skip to content

Commit 5c655a8

Browse files
committed
Do not scan whole file tree when making MANIFEST
When building a MANIFEST from a MANIFEST.in, setuptools previously scanned the whole directory tree into a list, and then picked matching files based on MANIFEST.in commands from this list. Now, files are found using the `glob` library from Python 3.5. This only explores directories that need to be scanned, resulting in a large speed improvement for projects with large file trees. A modified `glob` module has been included. It has been changed to support back to Python 2.6, and to include `.hidden` files in its matches. The previous functionality included `.hidden` files in its glob matches. It is unclear if this behaviour is desired and required, or accidental and not required, but for strict backwards-compatibility, this behaviour is kept. Each command in the MANIFEST.in is now represented by its own function on the FileList (`include`, `exclude`, `graft`, etc.) to allow for an efficient implementation. The previous commands `FileList.include_pattern` and `FileList.exclude_pattern` still exist for backwards compatibility, but these use the slow 'scan all files' method, so are discouraged. `global_include` by its nature must scan all directories in the project to work, so this does not receive any speed improvements. The changes will speed up creating packages for the vast majority of users. There are a few unusual corner cases, such as multiple `graft` commands operating on the same set of directories, that will be slower. These can be solved by consolidating the overlapping `graft` commands into one command.
1 parent 1aa7190 commit 5c655a8

File tree

3 files changed

+472
-62
lines changed

3 files changed

+472
-62
lines changed

setuptools/command/egg_info.py

Lines changed: 249 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,40 +2,123 @@
22
33
Create a distribution's .egg-info directory and contents"""
44

5-
from distutils.filelist import FileList as _FileList
6-
from distutils.util import convert_path
7-
from distutils import log
5+
import collections
86
import distutils.errors
97
import distutils.filelist
8+
import io
109
import os
1110
import re
1211
import sys
13-
import io
14-
import warnings
1512
import time
16-
import collections
17-
18-
from setuptools.extern import six
19-
from setuptools.extern.six.moves import map
13+
import warnings
14+
from distutils import log
15+
from distutils.filelist import FileList as _FileList
16+
from distutils.filelist import translate_pattern
17+
from distutils.util import convert_path
18+
from fnmatch import translate
2019

21-
from setuptools import Command
22-
from setuptools.command.sdist import sdist
23-
from setuptools.command.sdist import walk_revctrl
24-
from setuptools.command.setopt import edit_config
25-
from setuptools.command import bdist_egg
26-
from pkg_resources import (
27-
parse_requirements, safe_name, parse_version,
28-
safe_version, yield_lines, EntryPoint, iter_entry_points, to_filename)
2920
import setuptools.unicode_utils as unicode_utils
30-
21+
from pkg_resources import (
22+
EntryPoint, iter_entry_points, parse_requirements, parse_version, safe_name,
23+
safe_version, to_filename, yield_lines)
3124
from pkg_resources.extern import packaging
25+
from setuptools import Command
26+
from setuptools.command import bdist_egg
27+
from setuptools.command.sdist import sdist, walk_revctrl
28+
from setuptools.command.setopt import edit_config
29+
from setuptools.extern import six
30+
from setuptools.extern.six.moves import map
31+
from setuptools.glob import glob
3232

3333
try:
3434
from setuptools_svn import svn_utils
3535
except ImportError:
3636
pass
3737

3838

39+
40+
def translate_pattern(glob):
    """
    Translate a file path glob like '*.txt' into a compiled regular expression.

    This differs from fnmatch.translate, which allows wildcards to match
    directory separators: here '*' and '?' never match os.sep. A path
    component that is exactly '**' is a globstar. In the middle of a pattern
    it matches any number of directories (including none); as the final
    component it matches anything.

    The returned object is a compiled pattern anchored at the end of the
    input; use its ``match`` method to test a complete path.
    """
    # NOTE: the parameter name shadows the ``glob`` function imported at module
    # level. It is left unchanged so existing callers keep working.
    pat = ''

    # This will split on the separator within [character classes]. This is
    # deliberate.
    chunks = glob.split(os.path.sep)

    sep = re.escape(os.sep)
    valid_char = '[^%s]' % (sep,)
    last_index = len(chunks) - 1

    for c, chunk in enumerate(chunks):
        last_chunk = c == last_index

        # Chunks that are a literal ** are globstars. They match anything.
        if chunk == '**':
            if last_chunk:
                # Match anything if this is the last component
                pat += '.*'
            else:
                # Match '(name/)*'
                pat += '(?:%s+%s)*' % (valid_char, sep)
            # The whole path component (and its trailing separator) has been
            # handled, so move on to the next chunk.
            continue

        # Find any special characters in the remainder
        i = 0
        chunk_len = len(chunk)
        while i < chunk_len:
            char = chunk[i]
            if char == '*':
                # Match any number of name characters
                pat += valid_char + '*'
            elif char == '?':
                # Match a name character
                pat += valid_char
            elif char == '[':
                # Character class
                inner_i = i + 1
                # Skip initial !/] chars: a ']' directly after '[' or '[!'
                # is a literal member of the class, not its terminator.
                if inner_i < chunk_len and chunk[inner_i] == '!':
                    inner_i = inner_i + 1
                if inner_i < chunk_len and chunk[inner_i] == ']':
                    inner_i = inner_i + 1

                # Loop till the closing ] is found
                while inner_i < chunk_len and chunk[inner_i] != ']':
                    inner_i = inner_i + 1

                if inner_i >= chunk_len:
                    # Got to the end of the string without finding a closing ]
                    # Do not treat this as a matching group, but as a literal [
                    pat += re.escape(char)
                else:
                    # Grab the insides of the [brackets]
                    inner = chunk[i + 1:inner_i]
                    char_class = ''

                    # Class negation
                    if inner[0] == '!':
                        char_class = '^'
                        inner = inner[1:]

                    # NOTE(review): re.escape also escapes '-', so a class
                    # such as [a-z] matches the three literal characters
                    # rather than a range. Left as-is to preserve behaviour.
                    char_class += re.escape(inner)
                    pat += '[%s]' % (char_class,)

                    # Skip to the end ]
                    i = inner_i
            else:
                pat += re.escape(char)
            i += 1

        # Join each chunk with the dir separator
        if not last_chunk:
            pat += sep

    # Pass the flags as an argument instead of appending '(?ms)' to the
    # pattern: global inline flags that are not at the start of the
    # expression are deprecated since Python 3.6 and rejected by 3.11+.
    return re.compile(pat + r'\Z', flags=re.MULTILINE | re.DOTALL)
120+
121+
39122
class egg_info(Command):
40123
description = "create a distribution's .egg-info directory"
41124

@@ -239,7 +322,151 @@ def check_broken_egg_info(self):
239322

240323

241324
class FileList(_FileList):
242-
"""File list that accepts only existing, platform-independent paths"""
325+
# Implementations of the various MANIFEST.in commands
326+
327+
def process_template_line(self, line):
    """
    Apply a single MANIFEST.in template line to this file list.

    The line is parsed by ``_parse_template_line`` into
    ``(action, patterns, dir, dir_pattern)`` and the action is dispatched to
    the method of the same purpose on this class (``include``, ``exclude``,
    ``graft``, ...). When such a method returns a false value, a warning is
    logged for the pattern concerned.

    Raises distutils.errors.DistutilsInternalError if the parsed action is
    not one of the commands handled here.
    """
    # Parse the line: split it up, make sure the right number of words
    # is there, and return the relevant words.  'action' is always
    # defined: it's the first word of the line.  Which of the other
    # three are defined depends on the action; it'll be either
    # patterns, (dir and patterns), or (dir_pattern).
    (action, patterns, dir, dir_pattern) = self._parse_template_line(line)

    # OK, now we know that the action is valid and we have the
    # right number of words on the line for that action -- so we
    # can proceed with minimal error-checking.
    if action == 'include':
        self.debug_print("include " + ' '.join(patterns))
        for pattern in patterns:
            if not self.include(pattern):
                log.warn("warning: no files found matching '%s'", pattern)

    elif action == 'exclude':
        self.debug_print("exclude " + ' '.join(patterns))
        for pattern in patterns:
            if not self.exclude(pattern):
                log.warn(("warning: no previously-included files "
                          "found matching '%s'"), pattern)

    elif action == 'global-include':
        self.debug_print("global-include " + ' '.join(patterns))
        for pattern in patterns:
            if not self.global_include(pattern):
                log.warn(("warning: no files found matching '%s' "
                          "anywhere in distribution"), pattern)

    elif action == 'global-exclude':
        self.debug_print("global-exclude " + ' '.join(patterns))
        for pattern in patterns:
            if not self.global_exclude(pattern):
                log.warn(("warning: no previously-included files matching "
                          "'%s' found anywhere in distribution"),
                         pattern)

    elif action == 'recursive-include':
        self.debug_print("recursive-include %s %s" %
                         (dir, ' '.join(patterns)))
        for pattern in patterns:
            if not self.recursive_include(dir, pattern):
                log.warn(("warning: no files found matching '%s' "
                          "under directory '%s'"),
                         pattern, dir)

    elif action == 'recursive-exclude':
        self.debug_print("recursive-exclude %s %s" %
                         (dir, ' '.join(patterns)))
        for pattern in patterns:
            if not self.recursive_exclude(dir, pattern):
                log.warn(("warning: no previously-included files matching "
                          "'%s' found under directory '%s'"),
                         pattern, dir)

    elif action == 'graft':
        self.debug_print("graft " + dir_pattern)
        if not self.graft(dir_pattern):
            log.warn("warning: no directories found matching '%s'",
                     dir_pattern)

    elif action == 'prune':
        self.debug_print("prune " + dir_pattern)
        if not self.prune(dir_pattern):
            log.warn(("no previously-included directories found "
                      "matching '%s'"), dir_pattern)

    else:
        # The bare name DistutilsInternalError is not imported by this
        # module; only the distutils.errors module is, so qualify it to
        # avoid a NameError masking the real problem.
        raise distutils.errors.DistutilsInternalError(
            "this cannot happen: invalid action '%s'" % action)
399+
400+
def _remove_files(self, predicate):
401+
"""
402+
Remove all files from the file list that match the predicate.
403+
Return True if any matching files were removed
404+
"""
405+
found = False
406+
for i in range(len(self.files) - 1, -1, -1):
407+
if predicate(self.files[i]):
408+
self.debug_print(" removing " + self.files[i])
409+
del self.files[i]
410+
found = True
411+
return found
412+
413+
def include(self, pattern):
    """
    Add every non-directory path matched by the glob 'pattern'.

    Return True if at least one path was added, otherwise False.
    """
    matches = []
    for path in glob(pattern):
        if os.path.isdir(path):
            # Directories themselves are never listed, only files.
            continue
        matches.append(path)
    self.extend(matches)
    return len(matches) > 0
418+
419+
def exclude(self, pattern):
    """
    Remove already-listed files whose path matches the glob 'pattern'.

    Return True if any file was removed, otherwise False.
    """
    return self._remove_files(translate_pattern(pattern).match)
423+
424+
def recursive_include(self, dir, pattern):
    """
    Add every non-directory path at any depth below 'dir' that matches
    'pattern'.

    The search is a recursive glob for ``dir/**/pattern``. Return True if at
    least one path was added, otherwise False.
    """
    matches = []
    for path in glob(os.path.join(dir, '**', pattern), recursive=True):
        if not os.path.isdir(path):
            matches.append(path)
    self.extend(matches)
    return len(matches) > 0
433+
434+
def recursive_exclude(self, dir, pattern):
    """
    Remove already-listed files at any depth below 'dir' that match
    'pattern'.

    Return True if any file was removed, otherwise False.
    """
    nested_glob = os.path.join(dir, '**', pattern)
    return self._remove_files(translate_pattern(nested_glob).match)
440+
441+
def graft(self, dir):
    """
    Add everything that ``distutils.filelist.findall(dir)`` reports.

    Return True if that result was non-empty, otherwise False.
    """
    discovered = distutils.filelist.findall(dir)
    self.extend(discovered)
    return True if discovered else False
446+
447+
def prune(self, dir):
    """
    Remove every already-listed file whose path matches ``dir/**``.

    Return True if any file was removed, otherwise False.
    """
    subtree_glob = os.path.join(dir, '**')
    return self._remove_files(translate_pattern(subtree_glob).match)
451+
452+
def global_include(self, pattern):
    """
    Add every file from ``self.allfiles`` whose path matches ``**/pattern``.

    ``self.allfiles`` is populated with ``self.findall()`` first if it is
    still None, which makes this very inefficient on large file trees.
    Return True if at least one file was added, otherwise False.
    """
    if self.allfiles is None:
        self.findall()
    matcher = translate_pattern(os.path.join('**', pattern))
    hits = []
    for candidate in self.allfiles:
        if matcher.match(candidate):
            hits.append(candidate)
    self.extend(hits)
    return len(hits) > 0
463+
464+
def global_exclude(self, pattern):
    """
    Remove every already-listed file whose path matches ``**/pattern``.

    Return True if any file was removed, otherwise False.
    """
    anywhere_glob = os.path.join('**', pattern)
    return self._remove_files(translate_pattern(anywhere_glob).match)
243470

244471
def append(self, item):
245472
if item.endswith('\r'): # Fix older sdists built on Windows
@@ -302,7 +529,6 @@ def run(self):
302529
self.filelist = FileList()
303530
if not os.path.exists(self.manifest):
304531
self.write_manifest() # it must exist so it'll get in the list
305-
self.filelist.findall()
306532
self.add_defaults()
307533
if os.path.exists(self.template):
308534
self.read_template()
@@ -341,38 +567,13 @@ def add_defaults(self):
341567
elif os.path.exists(self.manifest):
342568
self.read_manifest()
343569
ei_cmd = self.get_finalized_command('egg_info')
344-
self._add_egg_info(cmd=ei_cmd)
345-
self.filelist.include_pattern("*", prefix=ei_cmd.egg_info)
346-
347-
def _add_egg_info(self, cmd):
348-
"""
349-
Add paths for egg-info files for an external egg-base.
350-
351-
The egg-info files are written to egg-base. If egg-base is
352-
outside the current working directory, this method
353-
searchs the egg-base directory for files to include
354-
in the manifest. Uses distutils.filelist.findall (which is
355-
really the version monkeypatched in by setuptools/__init__.py)
356-
to perform the search.
357-
358-
Since findall records relative paths, prefix the returned
359-
paths with cmd.egg_base, so add_default's include_pattern call
360-
(which is looking for the absolute cmd.egg_info) will match
361-
them.
362-
"""
363-
if cmd.egg_base == os.curdir:
364-
# egg-info files were already added by something else
365-
return
366-
367-
discovered = distutils.filelist.findall(cmd.egg_base)
368-
resolved = (os.path.join(cmd.egg_base, path) for path in discovered)
369-
self.filelist.allfiles.extend(resolved)
570+
self.filelist.graft(ei_cmd.egg_info)
370571

371572
def prune_file_list(self):
372573
build = self.get_finalized_command('build')
373574
base_dir = self.distribution.get_fullname()
374-
self.filelist.exclude_pattern(None, prefix=build.build_base)
375-
self.filelist.exclude_pattern(None, prefix=base_dir)
575+
self.filelist.prune(build.build_base)
576+
self.filelist.prune(base_dir)
376577
sep = re.escape(os.sep)
377578
self.filelist.exclude_pattern(r'(^|' + sep + r')(RCS|CVS|\.svn)' + sep,
378579
is_regex=1)

0 commit comments

Comments
 (0)