diff --git a/news/7729.feature b/news/7729.feature new file mode 100644 index 00000000000..de4f6e7c8d7 --- /dev/null +++ b/news/7729.feature @@ -0,0 +1,3 @@ +Cache the result of parse_links() to avoid re-tokenizing a find-links page multiple times over a pip run. + +This change significantly improves resolve performance when --find-links points to a very large html page. diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 033e37ac75b..e2c800c2cde 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -3,6 +3,7 @@ """ import cgi +import functools import itertools import logging import mimetypes @@ -25,8 +26,8 @@ if MYPY_CHECK_RUNNING: from typing import ( - Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple, - Union, + Callable, Iterable, List, MutableMapping, Optional, + Protocol, Sequence, Tuple, TypeVar, Union, ) import xml.etree.ElementTree @@ -38,10 +39,31 @@ HTMLElement = xml.etree.ElementTree.Element ResponseHeaders = MutableMapping[str, str] + # Used in the @lru_cache polyfill. + F = TypeVar('F') + + class LruCache(Protocol): + def __call__(self, maxsize=None): + # type: (Optional[int]) -> Callable[[F], F] + raise NotImplementedError + logger = logging.getLogger(__name__) +# Fallback to noop_lru_cache in Python 2 +# TODO: this can be removed when python 2 support is dropped! +def noop_lru_cache(maxsize=None): + # type: (Optional[int]) -> Callable[[F], F] + def _wrapper(f): + # type: (F) -> F + return f + return _wrapper + + +_lru_cache = getattr(functools, "lru_cache", noop_lru_cache) # type: LruCache + + def _match_vcs_scheme(url): # type: (str) -> Optional[str] """Look for VCS schemes in the URL. @@ -285,6 +307,48 @@ def _create_link_from_element( return link +class CacheablePageContent(object): + def __init__(self, page): + # type: (HTMLPage) -> None + assert page.cache_link_parsing + self.page = page + + def __eq__(self, other): + # type: (object) -> bool + return (isinstance(other, type(self)) and + self.page.url == other.page.url) + + def __hash__(self): + # type: () -> int + return hash(self.page.url) + + +def with_cached_html_pages( + fn, # type: Callable[[HTMLPage], Iterable[Link]] +): + # type: (...) -> Callable[[HTMLPage], List[Link]] + """ + Given a function that parses an Iterable[Link] from an HTMLPage, cache the + function's result (keyed by CacheablePageContent), unless the HTMLPage + `page` has `page.cache_link_parsing == False`. + """ + + @_lru_cache(maxsize=None) + def wrapper(cacheable_page): + # type: (CacheablePageContent) -> List[Link] + return list(fn(cacheable_page.page)) + + @functools.wraps(fn) + def wrapper_wrapper(page): + # type: (HTMLPage) -> List[Link] + if page.cache_link_parsing: + return wrapper(CacheablePageContent(page)) + return list(fn(page)) + + return wrapper_wrapper + + +@with_cached_html_pages def parse_links(page): # type: (HTMLPage) -> Iterable[Link] """ @@ -314,18 +378,23 @@ class HTMLPage(object): def __init__( self, - content, # type: bytes - encoding, # type: Optional[str] - url, # type: str + content, # type: bytes + encoding, # type: Optional[str] + url, # type: str + cache_link_parsing=True, # type: bool ): # type: (...) -> None """ :param encoding: the encoding to decode the given content. :param url: the URL from which the HTML was downloaded. + :param cache_link_parsing: whether links parsed from this page's url + should be cached. PyPI index urls should + have this set to False, for example. """ self.content = content self.encoding = encoding self.url = url + self.cache_link_parsing = cache_link_parsing def __str__(self): # type: () -> str @@ -343,10 +412,14 @@ def _handle_get_page_fail( meth("Could not fetch URL %s: %s - skipping", link, reason) -def _make_html_page(response): - # type: (Response) -> HTMLPage +def _make_html_page(response, cache_link_parsing=True): + # type: (Response, bool) -> HTMLPage encoding = _get_encoding_from_headers(response.headers) - return HTMLPage(response.content, encoding=encoding, url=response.url) + return HTMLPage( + response.content, + encoding=encoding, + url=response.url, + cache_link_parsing=cache_link_parsing) def _get_html_page(link, session=None): @@ -399,7 +472,8 @@ def _get_html_page(link, session=None): except requests.Timeout: _handle_get_page_fail(link, "timed out") else: - return _make_html_page(resp) + return _make_html_page(resp, + cache_link_parsing=link.cache_link_parsing) return None @@ -562,7 +636,9 @@ def collect_links(self, project_name): # We want to filter out anything that does not have a secure origin. url_locations = [ link for link in itertools.chain( - (Link(url) for url in index_url_loc), + # Mark PyPI indices as "cache_link_parsing == False" -- this + # will avoid caching the result of parsing the page for links. + (Link(url, cache_link_parsing=False) for url in index_url_loc), (Link(url) for url in fl_url_loc), ) if self.session.is_secure_origin(link) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index c3e743d3904..df4f8f01685 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -30,6 +30,7 @@ def __init__( comes_from=None, # type: Optional[Union[str, HTMLPage]] requires_python=None, # type: Optional[str] yanked_reason=None, # type: Optional[Text] + cache_link_parsing=True, # type: bool ): # type: (...) -> None """ @@ -46,6 +47,11 @@ def __init__( a simple repository HTML link. If the file has been yanked but no reason was provided, this should be the empty string. See PEP 592 for more information and the specification. + :param cache_link_parsing: A flag that is used elsewhere to determine + whether resources retrieved from this link + should be cached. PyPI index urls should + generally have this set to False, for + example. """ # url can be a UNC windows share @@ -63,6 +69,8 @@ def __init__( super(Link, self).__init__(key=url, defining_class=Link) + self.cache_link_parsing = cache_link_parsing + def __str__(self): # type: () -> str if self.requires_python: diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index a650a2a17ef..cfc2af1c07a 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1,5 +1,7 @@ import logging import os.path +import re +import uuid from textwrap import dedent import mock @@ -26,7 +28,7 @@ from pip._internal.models.index import PyPI from pip._internal.models.link import Link from pip._internal.network.session import PipSession -from tests.lib import make_test_link_collector +from tests.lib import make_test_link_collector, skip_if_python2 @pytest.mark.parametrize( @@ -355,7 +357,9 @@ def test_parse_links__yanked_reason(anchor_html, expected): page = HTMLPage( html_bytes, encoding=None, - url='https://example.com/simple/', + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. + url='https://example.com/simple-{}/'.format(uuid.uuid4()), ) links = list(parse_links(page)) link, = links @@ -363,6 +367,51 @@ def test_parse_links__yanked_reason(anchor_html, expected): assert actual == expected +@skip_if_python2 +def test_parse_links_caches_same_page_by_url(): + html = ( + '
' + '' + ) + html_bytes = html.encode('utf-8') + + url = 'https://example.com/simple/' + + page_1 = HTMLPage( + html_bytes, + encoding=None, + url=url, + ) + # Make a second page with zero content, to ensure that it's not accessed, + # because the page was cached by url. + page_2 = HTMLPage( + b'', + encoding=None, + url=url, + ) + # Make a third page which represents an index url, which should not be + # cached, even for the same url. We modify the page content slightly to + # verify that the result is not cached. + page_3 = HTMLPage( + re.sub(b'pkg1', b'pkg2', html_bytes), + encoding=None, + url=url, + cache_link_parsing=False, + ) + + parsed_links_1 = list(parse_links(page_1)) + assert len(parsed_links_1) == 1 + assert 'pkg1' in parsed_links_1[0].url + + parsed_links_2 = list(parse_links(page_2)) + assert parsed_links_2 == parsed_links_1 + + parsed_links_3 = list(parse_links(page_3)) + assert len(parsed_links_3) == 1 + assert parsed_links_3 != parsed_links_1 + assert 'pkg2' in parsed_links_3[0].url + + def test_request_http_error(caplog): caplog.set_level(logging.DEBUG) link = Link('http://localhost') @@ -528,13 +577,14 @@ def test_fetch_page(self, mock_get_html_response): fake_response = make_fake_html_response(url) mock_get_html_response.return_value = fake_response - location = Link(url) + location = Link(url, cache_link_parsing=False) link_collector = make_test_link_collector() actual = link_collector.fetch_page(location) assert actual.content == fake_response.content assert actual.encoding is None assert actual.url == url + assert actual.cache_link_parsing == location.cache_link_parsing # Also check that the right session object was passed to # _get_html_response(). @@ -559,8 +609,12 @@ def test_collect_links(self, caplog, data): assert len(actual.find_links) == 1 check_links_include(actual.find_links, names=['packages']) + # Check that find-links URLs are marked as cacheable. + assert actual.find_links[0].cache_link_parsing assert actual.project_urls == [Link('https://pypi.org/simple/twine/')] + # Check that index URLs are marked as *un*cacheable. + assert not actual.project_urls[0].cache_link_parsing expected_message = dedent("""\ 1 location(s) to search for versions of twine: