diff --git a/news/12186.bugfix.rst b/news/12186.bugfix.rst new file mode 100644 index 00000000000..b51d84a0cb2 --- /dev/null +++ b/news/12186.bugfix.rst @@ -0,0 +1 @@ +Avoid downloading any dists in ``install --dry-run`` if PEP 658 ``.metadata`` files or lazy wheels are available. diff --git a/news/12256.feature.rst b/news/12256.feature.rst new file mode 100644 index 00000000000..0b5bb356188 --- /dev/null +++ b/news/12256.feature.rst @@ -0,0 +1 @@ +Cache computed metadata from sdists and lazy wheels in ``~/.cache/pip/link-metadata`` when ``--use-feature=metadata-cache`` is enabled. diff --git a/news/12257.feature.rst b/news/12257.feature.rst new file mode 100644 index 00000000000..21ea8cf5942 --- /dev/null +++ b/news/12257.feature.rst @@ -0,0 +1 @@ +Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled. diff --git a/news/12863.trivial.rst b/news/12863.trivial.rst new file mode 100644 index 00000000000..dc36a82a0df --- /dev/null +++ b/news/12863.trivial.rst @@ -0,0 +1 @@ +Cache "concrete" dists by ``Distribution`` instead of ``InstallRequirement``. diff --git a/news/12871.trivial.rst b/news/12871.trivial.rst new file mode 100644 index 00000000000..186e2bcb3c4 --- /dev/null +++ b/news/12871.trivial.rst @@ -0,0 +1 @@ +Refactor much of ``RequirementPreparer`` to avoid duplicated code paths for metadata-only requirements. diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index ce98f288395..0f1c54c328c 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -2,12 +2,14 @@ from __future__ import annotations +import abc import hashlib import json import logging import os +import re +from collections.abc import Iterator from pathlib import Path -from typing import Any from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version from pip._vendor.packaging.utils import canonicalize_name @@ -16,21 +18,71 @@ from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel +from pip._internal.req.req_install import InstallRequirement from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds from pip._internal.utils.urls import path_to_url +from pip._internal.vcs import vcs logger = logging.getLogger(__name__) +_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) + ORIGIN_JSON_NAME = "origin.json" +def _contains_egg_info(s: str) -> bool: + """Determine whether the string looks like an egg_info. + + :param s: The string to parse. E.g. foo-2.1 + """ + return bool(_egg_info_re.search(s)) + + +def should_cache( + req: InstallRequirement, +) -> bool: + """ + Return whether a built InstallRequirement can be stored in the persistent + wheel cache, assuming the wheel cache is available, and _should_build() + has determined a wheel needs to be built. + """ + if not req.link: + return False + + if req.link.is_wheel: + return False + + if req.editable or not req.source_dir: + # never cache editable requirements + return False + + if req.link and req.link.is_vcs: + # VCS checkout. Do not cache + # unless it points to an immutable commit hash. 
+ assert not req.editable + assert req.source_dir + vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) + assert vcs_backend + if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): + return True + return False + + assert req.link + base, ext = req.link.splitext() + if _contains_egg_info(base): + return True + + # Otherwise, do not cache. + return False + + def _hash_dict(d: dict[str, str]) -> str: """Return a stable sha224 of a dictionary.""" s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True) return hashlib.sha224(s.encode("ascii")).hexdigest() -class Cache: +class Cache(abc.ABC): """An abstract class - provides cache directories for data from links :param cache_dir: The root of the cache. @@ -41,7 +93,9 @@ def __init__(self, cache_dir: str) -> None: assert not cache_dir or os.path.isabs(cache_dir) self.cache_dir = cache_dir or None - def _get_cache_path_parts(self, link: Link) -> list[str]: + def _get_cache_path_parts( + self, link: Link, *, interpreter_dependent: bool + ) -> list[str]: """Get parts of part that must be os.path.joined with cache_dir""" # We want to generate an url to use as our cache key, we don't want to @@ -53,13 +107,14 @@ def _get_cache_path_parts(self, link: Link) -> list[str]: if link.subdirectory_fragment: key_parts["subdirectory"] = link.subdirectory_fragment - # Include interpreter name, major and minor version in cache key - # to cope with ill-behaved sdists that build a different wheel - # depending on the python version their setup.py is being run on, - # and don't encode the difference in compatibility tags. - # https://github.com/pypa/pip/issues/7296 - key_parts["interpreter_name"] = interpreter_name() - key_parts["interpreter_version"] = interpreter_version() + if interpreter_dependent: + # Include interpreter name, major and minor version in cache key + # to cope with ill-behaved sdists that build a different wheel + # depending on the python version their setup.py is being run on, + # and don't encode the difference in compatibility tags. + # https://github.com/pypa/pip/issues/7296 + key_parts["interpreter_name"] = interpreter_name() + key_parts["interpreter_version"] = interpreter_version() # Encode our key url with sha224, we'll use this because it has similar # security properties to sha256, but with a shorter total output (and @@ -74,20 +129,37 @@ def _get_cache_path_parts(self, link: Link) -> list[str]: return parts - def _get_candidates(self, link: Link, canonical_package_name: str) -> list[Any]: - can_not_cache = not self.cache_dir or not canonical_package_name or not link - if can_not_cache: - return [] + @abc.abstractmethod + def get_path_for_link(self, link: Link) -> str: + """Return a directory to store cached items in for link.""" + ... 
- path = self.get_path_for_link(link) - if os.path.isdir(path): - return [(candidate, path) for candidate in os.listdir(path)] - return [] + def cache_path(self, link: Link) -> Path: + return Path(self.get_path_for_link(link)) + + +class LinkMetadataCache(Cache): + """Persistently store the metadata of dists found at each link.""" def get_path_for_link(self, link: Link) -> str: - """Return a directory to store cached items in for link.""" - raise NotImplementedError() + parts = self._get_cache_path_parts(link, interpreter_dependent=True) + assert self.cache_dir + return os.path.join(self.cache_dir, "link-metadata", *parts) + + +class FetchResolveCache(Cache): + def get_path_for_link(self, link: Link) -> str: + # We are reading index links to extract other links from, not executing any + # python code, so these caches are interpreter-independent. + parts = self._get_cache_path_parts(link, interpreter_dependent=False) + assert self.cache_dir + return os.path.join(self.cache_dir, "fetch-resolve", *parts) + +class WheelCacheBase(Cache): + """Specializations to the cache concept for wheels.""" + + @abc.abstractmethod def get( self, link: Link, @@ -97,10 +169,27 @@ def get( """Returns a link to a cached item if it exists, otherwise returns the passed link. """ - raise NotImplementedError() + ... + + def _can_cache(self, link: Link, canonical_package_name: str) -> bool: + return bool(self.cache_dir and canonical_package_name and link) + + def _get_candidates( + self, link: Link, canonical_package_name: str + ) -> Iterator[tuple[str, str]]: + if not self._can_cache(link, canonical_package_name): + return + path = self.get_path_for_link(link) + if not os.path.isdir(path): + return + + for candidate in os.scandir(path): + if candidate.is_file(): + yield (candidate.name, path) -class SimpleWheelCache(Cache): + +class SimpleWheelCache(WheelCacheBase): """A cache of wheels for future installs.""" def __init__(self, cache_dir: str) -> None: @@ -121,7 +210,7 @@ def get_path_for_link(self, link: Link) -> str: :param link: The link of the sdist for which this will cache wheels. 
""" - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir # Store wheels within the root cache_dir return os.path.join(self.cache_dir, "wheels", *parts) @@ -132,7 +221,7 @@ def get( package_name: str | None, supported_tags: list[Tag], ) -> Link: - candidates = [] + candidates: list[tuple[int, str, str]] = [] if not package_name: return link @@ -206,7 +295,7 @@ def __init__( ) -class WheelCache(Cache): +class WheelCache(WheelCacheBase): """Wraps EphemWheelCache and SimpleWheelCache into a single Cache This Cache allows for gracefully degradation, using the ephem wheel cache @@ -224,6 +313,15 @@ def get_path_for_link(self, link: Link) -> str: def get_ephem_path_for_link(self, link: Link) -> str: return self._ephem_cache.get_path_for_link(link) + def resolve_cache_dir(self, req: InstallRequirement) -> str: + """Return the persistent or temporary cache directory where the built or + downloaded wheel should be stored.""" + cache_available = bool(self.cache_dir) + assert req.link, req + if cache_available and should_cache(req): + return self.get_path_for_link(req.link) + return self.get_ephem_path_for_link(req.link) + def get( self, link: Link, diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py index 3519dadf13d..68f3927d808 100644 --- a/src/pip/_internal/cli/cmdoptions.py +++ b/src/pip/_internal/cli/cmdoptions.py @@ -1072,6 +1072,8 @@ def check_list_path_option(options: Values) -> None: default=[], choices=[ "fast-deps", + "metadata-cache", + "truststore", ] + ALWAYS_ENABLED_FEATURES, help="Enable new functionality, that may be backward incompatible.", diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index dc1328ff019..864cc2959c5 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -13,7 +13,7 @@ from typing import Any from pip._internal.build_env import SubprocessBuildEnvironmentInstaller -from pip._internal.cache import WheelCache +from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache from pip._internal.cli import cmdoptions from pip._internal.cli.index_command import IndexGroupCommand from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin @@ -132,6 +132,16 @@ def make_requirement_preparer( "fast-deps has no effect when used with the legacy resolver." ) + if options.cache_dir and "metadata-cache" in options.features_enabled: + logger.warning( + "pip is using a local cache for metadata information. " + "This experimental feature is enabled through " + "--use-feature=metadata-cache and it is not ready for " + "production." 
+ ) + metadata_cache = LinkMetadataCache(options.cache_dir) + else: + metadata_cache = None return RequirementPreparer( build_dir=temp_build_dir_path, src_dir=options.src_dir, @@ -149,6 +159,7 @@ def make_requirement_preparer( verbosity=verbosity, legacy_resolver=legacy_resolver, resume_retries=options.resume_retries, + metadata_cache=metadata_cache, ) @classmethod @@ -344,8 +355,13 @@ def _build_package_finder( ignore_requires_python=ignore_requires_python, ) + if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled): + fetch_resolve_cache = FetchResolveCache(options.cache_dir) + else: + fetch_resolve_cache = None return PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + fetch_resolve_cache=fetch_resolve_cache, ) diff --git a/src/pip/_internal/commands/download.py b/src/pip/_internal/commands/download.py index 900fb403d6f..a79fdec3050 100644 --- a/src/pip/_internal/commands/download.py +++ b/src/pip/_internal/commands/download.py @@ -129,6 +129,9 @@ def run(self, options: Values, args: list[str]) -> int: self.trace_basic_info(finder) requirement_set = resolver.resolve(reqs, check_supported_wheels=True) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), require_dist_files=True + ) downloaded: list[str] = [] for req in requirement_set.requirements.values(): @@ -137,8 +140,6 @@ def run(self, options: Values, args: list[str]) -> int: preparer.save_linked_requirement(req) downloaded.append(req.name) - preparer.prepare_linked_requirements_more(requirement_set.requirements.values()) - if downloaded: write_output("Successfully downloaded %s", " ".join(downloaded)) diff --git a/src/pip/_internal/commands/install.py b/src/pip/_internal/commands/install.py index 1ef7a0f4410..3f5d3e6c256 100644 --- a/src/pip/_internal/commands/install.py +++ b/src/pip/_internal/commands/install.py @@ -99,7 +99,8 @@ def add_options(self) -> None: help=( "Don't actually install anything, just print what would be. " "Can be used in combination with --ignore-installed " - "to 'resolve' the requirements." + "to 'resolve' the requirements. If package metadata is available " + "or cached, --dry-run also avoids downloading the dependency at all." 
), ) self.cmd_opts.add_option( @@ -393,6 +394,10 @@ def run(self, options: Values, args: list[str]) -> int: requirement_set = resolver.resolve( reqs, check_supported_wheels=not options.target_dir ) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), + require_dist_files=not options.dry_run, + ) if options.json_report_file: report = InstallationReport(requirement_set.requirements_to_install) diff --git a/src/pip/_internal/commands/wheel.py b/src/pip/_internal/commands/wheel.py index 61be254912f..9519a940fc9 100644 --- a/src/pip/_internal/commands/wheel.py +++ b/src/pip/_internal/commands/wheel.py @@ -144,6 +144,9 @@ def run(self, options: Values, args: list[str]) -> int: self.trace_basic_info(finder) requirement_set = resolver.resolve(reqs, check_supported_wheels=True) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), require_dist_files=True + ) reqs_to_build: list[InstallRequirement] = [] for req in requirement_set.requirements.values(): @@ -152,8 +155,6 @@ def run(self, options: Values, args: list[str]) -> int: else: reqs_to_build.append(req) - preparer.prepare_linked_requirements_more(requirement_set.requirements.values()) - # build wheels build_successes, build_failures = build( reqs_to_build, diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index 9a89a838b9a..a870b227d1e 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -1,4 +1,5 @@ from pip._internal.distributions.base import AbstractDistribution +from pip._internal.distributions.installed import InstalledDistribution from pip._internal.distributions.sdist import SourceDistribution from pip._internal.distributions.wheel import WheelDistribution from pip._internal.req.req_install import InstallRequirement @@ -7,7 +8,18 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: - """Returns a Distribution for the given InstallRequirement""" + """Returns an AbstractDistribution for the given InstallRequirement. + + As AbstractDistribution only covers installable artifacts, this method may only be + invoked at the conclusion of a resolve, when the RequirementPreparer has downloaded + the file corresponding to the resolved dist. Commands which intend to consume + metadata-only resolves without downloading should not call this method or + consume AbstractDistribution objects. + """ + # Only pre-installed requirements will have a .satisfied_by dist. + if install_req.satisfied_by: + return InstalledDistribution(install_req) + # Editable requirements will always be source distributions. They use the # legacy logic until we create a modern standard for them. if install_req.editable: diff --git a/src/pip/_internal/distributions/base.py b/src/pip/_internal/distributions/base.py index ea61f3501e7..3ccb5bd0288 100644 --- a/src/pip/_internal/distributions/base.py +++ b/src/pip/_internal/distributions/base.py @@ -39,11 +39,17 @@ def build_tracker_id(self) -> str | None: If None, then this dist has no work to do in the build tracker, and ``.prepare_distribution_metadata()`` will not be called.""" - raise NotImplementedError() + ... @abc.abstractmethod def get_metadata_distribution(self) -> BaseDistribution: - raise NotImplementedError() + """Generate a concrete ``BaseDistribution`` instance for this artifact. 
+ + The implementation should also cache the result with + ``self.req.cache_concrete_dist()`` so the distribution is available to other + users of the ``InstallRequirement``. This method is not called within the build + tracker context, so it should not identify any new setup requirements.""" + ... @abc.abstractmethod def prepare_distribution_metadata( @@ -52,4 +58,11 @@ def prepare_distribution_metadata( build_isolation: bool, check_build_deps: bool, ) -> None: - raise NotImplementedError() + """Generate the information necessary to extract metadata from the artifact. + + This method will be executed within the context of ``BuildTracker#track()``, so + it needs to fully identify any setup requirements so they can be added to the + same active set of tracked builds, while ``.get_metadata_distribution()`` takes + care of generating and caching the ``BaseDistribution`` to expose to the rest of + the resolve.""" + ... diff --git a/src/pip/_internal/distributions/installed.py b/src/pip/_internal/distributions/installed.py index b6a67df24f4..5e6ee336866 100644 --- a/src/pip/_internal/distributions/installed.py +++ b/src/pip/_internal/distributions/installed.py @@ -21,8 +21,10 @@ def build_tracker_id(self) -> str | None: return None def get_metadata_distribution(self) -> BaseDistribution: - assert self.req.satisfied_by is not None, "not actually installed" - return self.req.satisfied_by + dist = self.req.satisfied_by + assert dist is not None, "not actually installed" + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, diff --git a/src/pip/_internal/distributions/sdist.py b/src/pip/_internal/distributions/sdist.py index e2821f89e00..c3b91cef621 100644 --- a/src/pip/_internal/distributions/sdist.py +++ b/src/pip/_internal/distributions/sdist.py @@ -7,7 +7,7 @@ from pip._internal.build_env import BuildEnvironment from pip._internal.distributions.base import AbstractDistribution from pip._internal.exceptions import InstallationError -from pip._internal.metadata import BaseDistribution +from pip._internal.metadata import BaseDistribution, get_directory_distribution from pip._internal.utils.subprocess import runner_with_spinner_message if TYPE_CHECKING: @@ -24,13 +24,19 @@ class SourceDistribution(AbstractDistribution): """ @property - def build_tracker_id(self) -> str | None: + def build_tracker_id(self) -> str: """Identify this requirement uniquely by its link.""" assert self.req.link return self.req.link.url_without_fragment def get_metadata_distribution(self) -> BaseDistribution: - return self.req.get_dist() + assert ( + self.req.metadata_directory + ), "Set as part of .prepare_distribution_metadata()" + dist = get_directory_distribution(self.req.metadata_directory) + self.req.cache_concrete_dist(dist) + self.req.validate_sdist_metadata() + return dist def prepare_distribution_metadata( self, @@ -69,7 +75,11 @@ def prepare_distribution_metadata( self._raise_conflicts("the backend dependencies", conflicting) if missing: self._raise_missing_reqs(missing) - self.req.prepare_metadata() + + # NB: we must still call .cache_concrete_dist() and .validate_sdist_metadata() + # before the InstallRequirement itself has been updated with the metadata from + # this directory! 
+ self.req.prepare_metadata_directory() def _prepare_build_backend( self, build_env_installer: BuildEnvironmentInstaller diff --git a/src/pip/_internal/distributions/wheel.py b/src/pip/_internal/distributions/wheel.py index ee12bfadc2e..61898712d42 100644 --- a/src/pip/_internal/distributions/wheel.py +++ b/src/pip/_internal/distributions/wheel.py @@ -33,7 +33,9 @@ def get_metadata_distribution(self) -> BaseDistribution: assert self.req.local_file_path, "Set as part of preparation during download" assert self.req.name, "Wheels are never unnamed" wheel = FilesystemWheel(self.req.local_file_path) - return get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + dist = get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 98f95494c62..4d90f6877fe 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -252,6 +252,25 @@ def __str__(self) -> str: return f"None {self.metadata_name} metadata found for distribution: {self.dist}" +class CacheMetadataError(PipError): + """Raised when de/serializing a requirement into the metadata cache.""" + + def __init__( + self, + req: InstallRequirement, + reason: str, + ) -> None: + """ + :param req: The requirement we attempted to cache. + :param reason: Context about the precise error that occurred. + """ + self.req = req + self.reason = reason + + def __str__(self) -> str: + return f"{self.reason} for {self.req} from {self.req.link}" + + class UserInstallationInvalid(InstallationError): """A --user install is requested on an environment without user site.""" diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 00d66daa3bf..fbbeaf5a23e 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -87,7 +87,9 @@ class _NotHTTP(Exception): pass -def _ensure_api_response(url: str, session: PipSession) -> None: +def _ensure_api_response( + url: str, session: PipSession, headers: dict[str, str] | None = None +) -> None: """ Send a HEAD request to the URL, and ensure the response contains a simple API Response. @@ -99,13 +101,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None: if scheme not in {"http", "https"}: raise _NotHTTP() - resp = session.head(url, allow_redirects=True) + resp = session.head(url, allow_redirects=True, headers=headers) raise_for_status(resp) _ensure_api_header(resp) -def _get_simple_response(url: str, session: PipSession) -> Response: +def _get_simple_response( + url: str, session: PipSession, headers: dict[str, str] | None = None +) -> Response: """Access an Simple API response with GET, and return the response. This consists of three parts: @@ -119,10 +123,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response: and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_api_response(url, session=session) + _ensure_api_response(url, session=session, headers=headers) logger.debug("Getting page %s", redact_auth_from_url(url)) + logger.debug("headers: %s", str(headers)) + if headers is None: + headers = {} resp = session.get( url, headers={ @@ -147,6 +154,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response: # once per 10 minutes. # For more information, please see pypa/pip#5670. 
"Cache-Control": "max-age=0", + **headers, }, ) raise_for_status(resp) @@ -225,7 +233,7 @@ def parse_links(page: IndexContent) -> Iterable[Link]: if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) for file in data.get("files", []): - link = Link.from_json(file, page.url) + link = Link.from_json(file, page.url, page_content=page) if link is None: continue yield link @@ -238,7 +246,9 @@ def parse_links(page: IndexContent) -> Iterable[Link]: url = page.url base_url = parser.base_url or url for anchor in parser.anchors: - link = Link.from_element(anchor, page_url=url, base_url=base_url) + link = Link.from_element( + anchor, page_url=url, base_url=base_url, page_content=page + ) if link is None: continue yield link @@ -253,6 +263,8 @@ class IndexContent: :param cache_link_parsing: whether links parsed from this page's url should be cached. PyPI index urls should have this set to False, for example. + :param etag: The ``ETag`` header from an HTTP request against ``url``. + :param date: The ``Date`` header from an HTTP request against ``url``. """ content: bytes @@ -260,6 +272,8 @@ class IndexContent: encoding: str | None url: str cache_link_parsing: bool = True + etag: str | None = None + date: str | None = None def __str__(self) -> str: return redact_auth_from_url(self.url) @@ -304,7 +318,8 @@ def _handle_get_simple_fail( def _make_index_content( - response: Response, cache_link_parsing: bool = True + response: Response, + cache_link_parsing: bool = True, ) -> IndexContent: encoding = _get_encoding_from_headers(response.headers) return IndexContent( @@ -313,11 +328,15 @@ def _make_index_content( encoding=encoding, url=response.url, cache_link_parsing=cache_link_parsing, + etag=response.headers.get("ETag", None), + date=response.headers.get("Date", None), ) -def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None: - url = link.url.split("#", 1)[0] +def _get_index_content( + link: Link, *, session: PipSession, headers: dict[str, str] | None = None +) -> IndexContent | None: + url = link.url_without_fragment # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) @@ -344,7 +363,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | Non logger.debug(" file: URL is directory, getting %s", url) try: - resp = _get_simple_response(url, session=session) + resp = _get_simple_response(url, session=session, headers=headers) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " @@ -360,9 +379,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | Non exc.request_desc, exc.content_type, ) - except NetworkConnectionError as exc: - _handle_get_simple_fail(link, exc) - except RetryError as exc: + except (NetworkConnectionError, RetryError) as exc: _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " @@ -436,11 +453,14 @@ def create( def find_links(self) -> list[str]: return self.search_scope.find_links - def fetch_response(self, location: Link) -> IndexContent | None: + def fetch_response( + self, location: Link, headers: dict[str, str] | None = None + ) -> IndexContent | None: """ Fetch an HTML page containing package links. 
""" - return _get_index_content(location, session=self.session) + logger.debug("headers: %s", str(headers)) + return _get_index_content(location, session=self.session, headers=headers) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index bc523cd42d8..633f1eb855e 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -2,17 +2,21 @@ from __future__ import annotations +import binascii +import datetime import enum import functools import itertools import logging +import os import re +import time from collections.abc import Iterable from dataclasses import dataclass +from hashlib import sha256 +from pathlib import Path from typing import ( TYPE_CHECKING, - Optional, - Union, ) from pip._vendor.packaging import specifiers @@ -21,13 +25,14 @@ from pip._vendor.packaging.version import InvalidVersion, _BaseVersion from pip._vendor.packaging.version import parse as parse_version +from pip._internal.cache import FetchResolveCache from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, InvalidWheelFilename, UnsupportedWheel, ) -from pip._internal.index.collector import LinkCollector, parse_links +from pip._internal.index.collector import IndexContent, LinkCollector, parse_links from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.format_control import FormatControl from pip._internal.models.link import Link @@ -47,14 +52,14 @@ if TYPE_CHECKING: from typing_extensions import TypeGuard + BuildTag = tuple[()] | tuple[int, str] + CandidateSortingKey = tuple[int, int, int, _BaseVersion, int | None, BuildTag] + __all__ = ["FormatControl", "BestCandidateResult", "PackageFinder"] logger = getLogger(__name__) -BuildTag = Union[tuple[()], tuple[int, str]] -CandidateSortingKey = tuple[int, int, int, _BaseVersion, Optional[int], BuildTag] - def _check_link_requires_python( link: Link, @@ -593,6 +598,7 @@ def __init__( format_control: FormatControl | None = None, candidate_prefs: CandidatePreferences | None = None, ignore_requires_python: bool | None = None, + fetch_resolve_cache: FetchResolveCache | None = None, ) -> None: """ This constructor is primarily meant to be used by the create() class @@ -627,6 +633,8 @@ def __init__( BestCandidateResult, ] = {} + self._fetch_resolve_cache = fetch_resolve_cache + # Don't include an allow_yanked default value to make sure each call # site considers whether yanked releases are allowed. This also causes # that decision to be made explicit in the calling code, which helps @@ -637,6 +645,7 @@ def create( link_collector: LinkCollector, selection_prefs: SelectionPreferences, target_python: TargetPython | None = None, + fetch_resolve_cache: FetchResolveCache | None = None, ) -> PackageFinder: """Create a PackageFinder. 
@@ -661,6 +670,7 @@ def create( allow_yanked=selection_prefs.allow_yanked, format_control=selection_prefs.format_control, ignore_requires_python=selection_prefs.ignore_requires_python, + fetch_resolve_cache=fetch_resolve_cache, ) @property @@ -800,18 +810,235 @@ def evaluate_links( return candidates - def process_project_url( + _HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z" + + @classmethod + def _try_load_http_cache_headers( + cls, + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + headers: dict[str, str], + ) -> tuple[str | None, datetime.datetime | None, bytes | None]: + etag: str | None = None + try: + etag = etag_path.read_text() + etag = f'"{etag}"' + logger.debug( + "found cached etag for url %s at %s: %s", + project_url, + etag_path, + etag, + ) + headers["If-None-Match"] = etag + except OSError as e: + logger.debug("no etag found for url %s (%s)", project_url, str(e)) + + date: datetime.datetime | None = None + try: + date_bytes = date_path.read_bytes() + date_int = int.from_bytes(date_bytes, byteorder="big", signed=False) + date = datetime.datetime.fromtimestamp(date_int, tz=datetime.timezone.utc) + logger.debug( + "found cached date for url %s at %s: '%s'", + project_url, + date_path, + date, + ) + headers["If-Modified-Since"] = date.strftime(cls._HTTP_DATE_FORMAT) + except OSError as e: + logger.debug("no date found for url %s (%s)", project_url, str(e)) + + checksum: bytes | None = None + try: + checksum = checksum_path.read_bytes() + logger.debug( + "found checksum for url %s at %s: '%s'", + project_url, + checksum_path, + binascii.b2a_base64(checksum, newline=False).decode("ascii"), + ) + except OSError as e: + logger.debug("no checksum found for url %s (%s)", project_url, str(e)) + + return (etag, date, checksum) + + _quoted_value = re.compile(r'^"([^"]*)"$') + + @classmethod + def _strip_quoted_value(cls, value: str) -> str: + return cls._quoted_value.sub(r"\1", value) + + _now_local = datetime.datetime.now().astimezone() + _local_tz = _now_local.tzinfo + assert _local_tz is not None + _local_tz_name = _local_tz.tzname(_now_local) + + @classmethod + def _write_http_cache_info( + cls, + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + index_response: IndexContent, + prev_etag: str | None, + prev_checksum: bytes | None, + ) -> tuple[str | None, datetime.datetime | None, bytes, bool]: + hasher = sha256() + hasher.update(index_response.content) + new_checksum = hasher.digest() + checksum_path.write_bytes(new_checksum) + page_unmodified = new_checksum == prev_checksum + + new_etag: str | None = index_response.etag + if new_etag is None: + logger.debug("no etag returned from fetch for url %s", project_url.url) + try: + etag_path.unlink() + except OSError: + pass + else: + new_etag = cls._strip_quoted_value(new_etag) + if new_etag != prev_etag: + logger.debug( + "etag for url %s updated from %s -> %s", + project_url.url, + prev_etag, + new_etag, + ) + etag_path.write_text(new_etag) + else: + logger.debug( + "etag was unmodified for url %s (%s)", project_url.url, prev_etag + ) + assert page_unmodified + + new_date: datetime.datetime | None = None + date_str: str | None = index_response.date + if date_str is None: + logger.debug( + "no date header was provided in response for url %s", project_url + ) + else: + date_str = date_str.strip() + new_time = time.strptime(date_str, cls._HTTP_DATE_FORMAT) + new_date = datetime.datetime.strptime(date_str, cls._HTTP_DATE_FORMAT) + # strptime() doesn't set the timezone according 
to the parsed %Z arg, which + # may be any of "UTC", "GMT", or any element of `time.tzname`. + if new_time.tm_zone in ["UTC", "GMT"]: + logger.debug( + "a UTC timezone was found in response for url %s", project_url + ) + new_date = new_date.replace(tzinfo=datetime.timezone.utc) + else: + assert new_time.tm_zone in time.tzname, new_time + logger.debug( + "a local timezone %s was found in response for url %s", + new_time.tm_zone, + project_url, + ) + if new_time.tm_zone == cls._local_tz_name: + new_date = new_date.replace(tzinfo=cls._local_tz) + else: + logger.debug( + "a local timezone %s had to be discarded in response for url %s", + new_time.tm_zone, + project_url, + ) + new_date = None + + if new_date is not None: + timestamp = new_date.timestamp() + # The timestamp will only have second resolution according to the parse + # format string _HTTP_DATE_FORMAT. + assert not (timestamp % 1), (new_date, timestamp) + epoch = int(timestamp) + assert epoch >= 0, (new_date, timestamp, epoch) + date_bytes = epoch.to_bytes(length=4, byteorder="big", signed=False) + date_path.write_bytes(date_bytes) + + logger.debug('date "%s" written for url %s', new_date, project_url) + if new_date is None: + try: + date_path.unlink() + except OSError: + pass + + return (new_etag, new_date, new_checksum, page_unmodified) + + def _process_project_url_uncached( self, project_url: Link, link_evaluator: LinkEvaluator ) -> list[InstallationCandidate]: logger.debug( "Fetching project page and analyzing links: %s", project_url, ) + index_response = self._link_collector.fetch_response(project_url) if index_response is None: return [] - page_links = list(parse_links(index_response)) + page_links = parse_links(index_response) + + with indent_log(): + package_links = self.evaluate_links(link_evaluator, links=page_links) + return package_links + + def process_project_url( + self, project_url: Link, link_evaluator: LinkEvaluator + ) -> list[InstallationCandidate]: + if self._fetch_resolve_cache is None: + return self._process_project_url_uncached(project_url, link_evaluator) + + cached_path = self._fetch_resolve_cache.cache_path(project_url) + os.makedirs(str(cached_path), exist_ok=True) + + etag_path = cached_path / "etag" + date_path = cached_path / "modified-since-date" + checksum_path = cached_path / "checksum" + + headers: dict[str, str] = { + # Wipe any other Cache-Control headers away--we are explicitly managing the + # caching here. + "Cache-Control": "", + } + # NB: mutates headers! + prev_etag, _prev_date, prev_checksum = self._try_load_http_cache_headers( + etag_path, date_path, checksum_path, project_url, headers + ) + + logger.debug( + "Fetching project page and analyzing links: %s", + project_url, + ) + + # A 304 Not Modified is implicitly converted into a reused cached response from + # the CacheControl library, so we won't explicitly check for a 304.
+ index_response = self._link_collector.fetch_response( + project_url, + headers=headers, + ) + if index_response is None: + return [] + + ( + _new_etag, + _new_date, + _new_checksum, + page_unmodified, + ) = self._write_http_cache_info( + etag_path, + date_path, + checksum_path, + project_url, + index_response, + prev_etag=prev_etag, + prev_checksum=prev_checksum, + ) + + page_links = parse_links(index_response) with indent_log(): package_links = self.evaluate_links( diff --git a/src/pip/_internal/metadata/__init__.py b/src/pip/_internal/metadata/__init__.py index 927e375cad0..368950f9946 100644 --- a/src/pip/_internal/metadata/__init__.py +++ b/src/pip/_internal/metadata/__init__.py @@ -9,7 +9,14 @@ from pip._internal.utils.deprecation import deprecated from pip._internal.utils.misc import strtobool -from .base import BaseDistribution, BaseEnvironment, FilesystemWheel, MemoryWheel, Wheel +from .base import ( + BaseDistribution, + BaseEnvironment, + FilesystemWheel, + MemoryWheel, + Wheel, + serialize_metadata, +) __all__ = [ "BaseDistribution", @@ -21,6 +28,7 @@ "get_environment", "get_wheel_distribution", "select_backend", + "serialize_metadata", ] diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index 230e11473c6..4e0d23625cd 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -1,7 +1,10 @@ from __future__ import annotations import csv +import email.generator import email.message +import email.policy +import email.utils import functools import json import logging @@ -11,10 +14,10 @@ from collections.abc import Collection, Container, Iterable, Iterator from typing import ( IO, + TYPE_CHECKING, Any, NamedTuple, Protocol, - Union, ) from pip._vendor.packaging.requirements import Requirement @@ -36,7 +39,8 @@ from ._json import msg_to_json -InfoPath = Union[str, pathlib.PurePath] +if TYPE_CHECKING: + InfoPath = str | pathlib.PurePath logger = logging.getLogger(__name__) @@ -85,6 +89,20 @@ def _convert_installed_files_path( return str(pathlib.Path(*info, *entry)) +def serialize_metadata(msg: email.message.Message) -> str: + """Write a dist's metadata to a string. + + Calling ``str(dist.metadata)`` may raise an error by misinterpreting RST directives + as email headers. This method uses the more robust ``email.policy.EmailPolicy`` to + avoid those parsing errors.""" + # Packages such as google_pasta-0.2.0 trigger a particular encoding behavior that + # required an upstream fix (https://github.com/python/cpython/pull/117369): + # + # > email.errors.HeaderWriteError: folded header + # > contains newline: 'Description: UNKNOWN\n\n\n' + return msg.as_string(policy=email.policy.HTTP.clone(refold_source="all")) + + class RequiresEntry(NamedTuple): requirement: str extra: str @@ -92,6 +110,15 @@ class RequiresEntry(NamedTuple): class BaseDistribution(Protocol): + @property + def is_concrete(self) -> bool: + """Whether the distribution really exists somewhere on disk. + + If this is false, it has been synthesized from metadata, e.g. via + ``.from_metadata_file_contents()``, or ``.from_wheel()`` against + a ``MemoryWheel``.""" + raise NotImplementedError() + @classmethod def from_directory(cls, directory: str) -> BaseDistribution: """Load the distribution from a metadata directory. 
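A brief illustration of the ``serialize_metadata()`` helper added above: ``refold_source="all"`` re-wraps every header under ``email.policy.HTTP``'s unlimited line length, which is what sidesteps the ``HeaderWriteError`` quoted in the docstring. This is a standalone sketch with a synthetic message; only the ``as_string()`` call mirrors pip's actual helper.

import email.policy
from email.message import Message

msg = Message()
msg["Metadata-Version"] = "2.1"
msg["Name"] = "example"
msg["Version"] = "1.0"
# A header with embedded newlines, like the google_pasta case cited above.
msg["Description"] = "UNKNOWN\n\n\n"

# Stricter email policies may raise email.errors.HeaderWriteError on this
# header; the HTTP policy clone used by serialize_metadata() writes it out.
text = msg.as_string(policy=email.policy.HTTP.clone(refold_source="all"))
print(text)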
@@ -664,6 +691,10 @@ def iter_installed_distributions( class Wheel(Protocol): location: str + @property + def is_concrete(self) -> bool: + raise NotImplementedError() + def as_zipfile(self) -> zipfile.ZipFile: raise NotImplementedError() @@ -672,6 +703,10 @@ class FilesystemWheel(Wheel): def __init__(self, location: str) -> None: self.location = location + @property + def is_concrete(self) -> bool: + return True + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.location, allowZip64=True) @@ -681,5 +716,9 @@ def __init__(self, location: str, stream: IO[bytes]) -> None: self.location = location self.stream = stream + @property + def is_concrete(self) -> bool: + return False + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.stream, allowZip64=True) diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py index 97363af9a55..9bd3e0766fb 100644 --- a/src/pip/_internal/metadata/importlib/_dists.py +++ b/src/pip/_internal/metadata/importlib/_dists.py @@ -7,6 +7,7 @@ from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence from os import PathLike from typing import ( + TYPE_CHECKING, cast, ) @@ -19,7 +20,6 @@ from pip._internal.metadata.base import ( BaseDistribution, BaseEntryPoint, - InfoPath, Wheel, ) from pip._internal.utils.misc import normalize_path @@ -33,6 +33,9 @@ parse_name_and_version_from_info_directory, ) +if TYPE_CHECKING: + from pip._internal.metadata.base import InfoPath + class WheelDistribution(importlib.metadata.Distribution): """An ``importlib.metadata.Distribution`` read from a wheel. @@ -104,16 +107,22 @@ def __init__( dist: importlib.metadata.Distribution, info_location: BasePath | None, installed_location: BasePath | None, + concrete: bool, ) -> None: self._dist = dist self._info_location = info_location self._installed_location = installed_location + self._concrete = concrete + + @property + def is_concrete(self) -> bool: + return self._concrete @classmethod def from_directory(cls, directory: str) -> BaseDistribution: info_location = pathlib.Path(directory) dist = importlib.metadata.Distribution.at(info_location) - return cls(dist, info_location, info_location.parent) + return cls(dist, info_location, info_location.parent, concrete=True) @classmethod def from_metadata_file_contents( @@ -130,7 +139,7 @@ def from_metadata_file_contents( metadata_path.write_bytes(metadata_contents) # Construct dist pointing to the newly created directory. 
dist = importlib.metadata.Distribution.at(metadata_path.parent) - return cls(dist, metadata_path.parent, None) + return cls(dist, metadata_path.parent, None, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -139,7 +148,14 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: dist = WheelDistribution.from_zipfile(zf, name, wheel.location) except zipfile.BadZipFile as e: raise InvalidWheel(wheel.location, name) from e - return cls(dist, dist.info_location, pathlib.PurePosixPath(wheel.location)) + except UnsupportedWheel as e: + raise UnsupportedWheel(f"{name} has an invalid wheel, {e}") + return cls( + dist, + dist.info_location, + pathlib.PurePosixPath(wheel.location), + concrete=wheel.is_concrete, + ) @property def location(self) -> str | None: diff --git a/src/pip/_internal/metadata/importlib/_envs.py b/src/pip/_internal/metadata/importlib/_envs.py index 71a73b7311f..0aeade7857d 100644 --- a/src/pip/_internal/metadata/importlib/_envs.py +++ b/src/pip/_internal/metadata/importlib/_envs.py @@ -86,7 +86,7 @@ def find(self, location: str) -> Iterator[BaseDistribution]: installed_location: BasePath | None = None else: installed_location = info_location.parent - yield Distribution(dist, info_location, installed_location) + yield Distribution(dist, info_location, installed_location, concrete=True) def find_legacy_editables(self, location: str) -> Iterator[BaseDistribution]: """Read location in egg-link files and return distributions in there. @@ -110,7 +110,7 @@ def find_legacy_editables(self, location: str) -> Iterator[BaseDistribution]: continue target_location = str(path.joinpath(target_rel)) for dist, info_location in self._find_impl(target_location): - yield Distribution(dist, info_location, path) + yield Distribution(dist, info_location, path, concrete=True) class Environment(BaseEnvironment): diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py index 89fce8b6e5d..0158972cd3d 100644 --- a/src/pip/_internal/metadata/pkg_resources.py +++ b/src/pip/_internal/metadata/pkg_resources.py @@ -7,6 +7,7 @@ import zipfile from collections.abc import Collection, Iterable, Iterator, Mapping from typing import ( + TYPE_CHECKING, NamedTuple, ) @@ -25,10 +26,12 @@ BaseDistribution, BaseEntryPoint, BaseEnvironment, - InfoPath, Wheel, ) +if TYPE_CHECKING: + from .base import InfoPath + __all__ = ["NAME", "Distribution", "Environment"] logger = logging.getLogger(__name__) @@ -78,8 +81,9 @@ def run_script(self, script_name: str, namespace: str) -> None: class Distribution(BaseDistribution): - def __init__(self, dist: pkg_resources.Distribution) -> None: + def __init__(self, dist: pkg_resources.Distribution, concrete: bool) -> None: self._dist = dist + self._concrete = concrete # This is populated lazily, to avoid loading metadata for all possible # distributions eagerly. 
self.__extra_mapping: Mapping[NormalizedName, str] | None = None @@ -93,6 +97,10 @@ def _extra_mapping(self) -> Mapping[NormalizedName, str]: return self.__extra_mapping + @property + def is_concrete(self) -> bool: + return self._concrete + @classmethod def from_directory(cls, directory: str) -> BaseDistribution: dist_dir = directory.rstrip(os.sep) @@ -111,7 +119,7 @@ def from_directory(cls, directory: str) -> BaseDistribution: dist_name = os.path.splitext(dist_dir_name)[0].split("-")[0] dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata) - return cls(dist) + return cls(dist, concrete=True) @classmethod def from_metadata_file_contents( @@ -128,7 +136,7 @@ def from_metadata_file_contents( metadata=InMemoryMetadata(metadata_dict, filename), project_name=project_name, ) - return cls(dist) + return cls(dist, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -149,7 +157,7 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: metadata=InMemoryMetadata(metadata_dict, wheel.location), project_name=name, ) - return cls(dist) + return cls(dist, concrete=wheel.is_concrete) @property def location(self) -> str | None: @@ -261,7 +269,7 @@ def from_paths(cls, paths: list[str] | None) -> BaseEnvironment: def _iter_distributions(self) -> Iterator[BaseDistribution]: for dist in self._ws: - yield Distribution(dist) + yield Distribution(dist, concrete=True) def _search_distribution(self, name: str) -> BaseDistribution | None: """Find a distribution matching the ``name`` in the environment. diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2e2c0f836ac..42c948cd955 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -281,6 +281,7 @@ def from_json( cls, file_data: dict[str, Any], page_url: str, + page_content: IndexContent | None = None, ) -> Link | None: """ Convert an pypi json document from a simple repository page into a Link. @@ -320,7 +321,7 @@ def from_json( return cls( url, - comes_from=page_url, + comes_from=page_content or page_url, requires_python=pyrequire, yanked_reason=yanked_reason, hashes=hashes, @@ -333,6 +334,7 @@ def from_element( anchor_attribs: dict[str, str | None], page_url: str, base_url: str, + page_content: IndexContent | None = None, ) -> Link | None: """ Convert an anchor element's attributes in a simple repository page to a Link. 
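To make the conditional-request flow concrete: ``_try_load_http_cache_headers()`` earlier in this diff turns the validators saved from a previous response into ``If-None-Match``/``If-Modified-Since`` headers, and ``_write_http_cache_info()`` persists the new validators after each fetch. A simplified standalone model of the request side follows; the file names, etag quoting, and 4-byte epoch encoding match the diff, while the helper itself is illustrative.

import datetime
from pathlib import Path

HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"

def conditional_headers(cached_path: Path) -> dict[str, str]:
    # Wipe Cache-Control away: caching is managed explicitly, as in
    # process_project_url().
    headers = {"Cache-Control": ""}
    etag_path = cached_path / "etag"
    date_path = cached_path / "modified-since-date"
    if etag_path.is_file():
        # The etag is stored unquoted and re-quoted on the way out.
        headers["If-None-Match"] = f'"{etag_path.read_text()}"'
    if date_path.is_file():
        # The date is stored as a big-endian unsigned epoch (4 bytes).
        epoch = int.from_bytes(date_path.read_bytes(), byteorder="big", signed=False)
        date = datetime.datetime.fromtimestamp(epoch, tz=datetime.timezone.utc)
        headers["If-Modified-Since"] = date.strftime(HTTP_DATE_FORMAT)
    return headers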
@@ -373,7 +375,7 @@ def from_element( return cls( url, - comes_from=page_url, + comes_from=page_content or page_url, requires_python=pyrequire, yanked_reason=yanked_reason, metadata_file_data=metadata_file_data, diff --git a/src/pip/_internal/operations/check.py b/src/pip/_internal/operations/check.py index 2d71fa5fff5..1ceca5c4f57 100644 --- a/src/pip/_internal/operations/check.py +++ b/src/pip/_internal/operations/check.py @@ -17,7 +17,6 @@ from pip._vendor.packaging.utils import NormalizedName, canonicalize_name from pip._vendor.packaging.version import Version -from pip._internal.distributions import make_distribution_for_install_requirement from pip._internal.metadata import get_default_environment from pip._internal.metadata.base import BaseDistribution from pip._internal.req.req_install import InstallRequirement @@ -148,8 +147,8 @@ def _simulate_installation_of( # Modify it as installing requirement_set would (assuming no errors) for inst_req in to_install: - abstract_dist = make_distribution_for_install_requirement(inst_req) - dist = abstract_dist.get_metadata_distribution() + assert inst_req.is_concrete + dist = inst_req.get_dist() name = dist.canonical_name package_set[name] = PackageDetails(dist.version, list(dist.iter_dependencies())) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 00b1a33a030..c037ba242b5 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -4,6 +4,8 @@ # mypy: strict-optional=False from __future__ import annotations +import bz2 +import json import mimetypes import os import shutil @@ -13,11 +15,13 @@ from typing import TYPE_CHECKING from pip._vendor.packaging.utils import canonicalize_name +from pip._vendor.requests.exceptions import InvalidSchema from pip._internal.build_env import BuildEnvironmentInstaller +from pip._internal.cache import LinkMetadataCache, should_cache from pip._internal.distributions import make_distribution_for_install_requirement -from pip._internal.distributions.installed import InstalledDistribution from pip._internal.exceptions import ( + CacheMetadataError, DirectoryUrlHashUnsupported, HashMismatch, HashUnpinned, @@ -27,7 +31,11 @@ VcsHashUnsupported, ) from pip._internal.index.package_finder import PackageFinder -from pip._internal.metadata import BaseDistribution, get_metadata_distribution +from pip._internal.metadata import ( + BaseDistribution, + get_metadata_distribution, + serialize_metadata, +) from pip._internal.models.direct_url import ArchiveInfo from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel @@ -68,16 +76,31 @@ def _get_prepared_distribution( build_env_installer: BuildEnvironmentInstaller, build_isolation: bool, check_build_deps: bool, -) -> BaseDistribution: - """Prepare a distribution for installation.""" +) -> tuple[bool, BaseDistribution]: + """Prepare a distribution for installation. + + This method will only be called by the preparer at the end of the resolve, and only + for commands which need installable artifacts (not just resolved metadata). If the + dist was previously metadata-only, the preparer must have downloaded the file + corresponding to the dist and set ``req.local_file_path``. + + This method will execute ``req.cache_concrete_dist()``, so that after invocation, + ``req.is_concrete`` will be True, because ``req.get_dist()`` will return a concrete + ``Distribution``. + + :returns: a 2-tuple: + - (bool): whether the metadata had to be constructed (e.g. 
from an sdist build), + - (BaseDistribution): the concrete distribution which is ready to be installed. + """ abstract_dist = make_distribution_for_install_requirement(req) tracker_id = abstract_dist.build_tracker_id - if tracker_id is not None: + builds_metadata = tracker_id is not None + if builds_metadata: with build_tracker.track(req, tracker_id): abstract_dist.prepare_distribution_metadata( build_env_installer, build_isolation, check_build_deps ) - return abstract_dist.get_metadata_distribution() + return (builds_metadata, abstract_dist.get_metadata_distribution()) def unpack_vcs_link(link: Link, location: str, verbosity: int) -> None: @@ -200,6 +223,8 @@ def _check_download_dir( ) -> str | None: """Check download_dir for previously downloaded file with correct hash If a correct file is found return its path else None + + If a file is found at the given path, but with an invalid hash, the file is deleted. """ download_path = os.path.join(download_dir, link.filename) @@ -222,6 +247,45 @@ def _check_download_dir( return download_path +@dataclass(frozen=True) +class CacheableDist: + metadata: str + filename: Path + canonical_name: str + + @classmethod + def from_dist(cls, link: Link, dist: BaseDistribution) -> CacheableDist: + """Extract the serializable data necessary to generate a metadata-only dist.""" + return cls( + metadata=serialize_metadata(dist.metadata), + filename=Path(link.filename), + canonical_name=dist.canonical_name, + ) + + def to_dist(self) -> BaseDistribution: + """Return a metadata-only dist from the deserialized cache entry.""" + return get_metadata_distribution( + metadata_contents=self.metadata.encode("utf-8"), + filename=str(self.filename), + canonical_name=self.canonical_name, + ) + + def to_json(self) -> dict[str, str]: + return { + "metadata": self.metadata, + "filename": str(self.filename), + "canonical_name": self.canonical_name, + } + + @classmethod + def from_json(cls, args: dict[str, str]) -> CacheableDist: + return cls( + metadata=args["metadata"], + filename=Path(args["filename"]), + canonical_name=args["canonical_name"], + ) + + class RequirementPreparer: """Prepares a Requirement""" @@ -244,6 +308,7 @@ def __init__( # noqa: PLR0913 (too many parameters) verbosity: int, legacy_resolver: bool, resume_retries: int, + metadata_cache: LinkMetadataCache | None = None, ) -> None: super().__init__() @@ -286,6 +351,8 @@ def __init__( # noqa: PLR0913 (too many parameters) # Previous "header" printed for a link-based InstallRequirement self._previous_requirement_header = ("", "") + self._metadata_cache = metadata_cache + def _log_preparing_link(self, req: InstallRequirement) -> None: """Provide context for the requirement being prepared.""" if req.link.is_file and not req.is_wheel_from_cache: @@ -317,11 +384,7 @@ def _ensure_link_req_src_dir( self, req: InstallRequirement, parallel_builds: bool ) -> None: """Ensure source_dir of a linked InstallRequirement.""" - # Since source_dir is only set for editable requirements. - if req.link.is_wheel: - # We don't need to unpack wheels, so no need for a source - # directory. - return + assert not req.link.is_wheel assert req.source_dir is None if req.link.is_existing_dir(): # build local directories in-tree @@ -368,6 +431,47 @@ def _get_linked_req_hashes(self, req: InstallRequirement) -> Hashes: # showing the user what the hash should be. 
return req.hashes(trust_internet=False) or MissingHashes() + def _rewrite_link_and_hashes_for_cached_wheel( + self, req: InstallRequirement, hashes: Hashes + ) -> Hashes | None: + """Check the hash of the requirement's source eagerly and rewrite its link. + + ``req.link`` is unconditionally rewritten to the cached wheel source so that the + requirement corresponds to where it was actually downloaded from instead of the + local cache entry. + + :returns: None if the source hash validated successfully. + """ + assert hashes + assert req.is_wheel_from_cache + assert req.download_info is not None + assert req.link.is_wheel + assert req.link.is_file + + # TODO: is it possible to avoid providing a "wrong" req.link in the first place + # in the resolver, instead of having to patch it up afterwards? + req.link = req.cached_wheel_source_link + + # We need to verify hashes, and we have found the requirement in the cache + # of locally built wheels. + if ( + isinstance(req.download_info.info, ArchiveInfo) + and req.download_info.info.hashes + and hashes.has_one_of(req.download_info.info.hashes) + ): + # At this point we know the requirement was built from a hashable source + # artifact, and we verified that the cache entry's hash of the original + # artifact matches one of the hashes we expect. We don't verify hashes + # against the cached wheel, because the wheel is not the original. + return None + + logger.warning( + "The hashes of the source archive found in cache entry " + "don't match, ignoring cached built wheel " + "and re-downloading source." + ) + return hashes + def _fetch_metadata_only( self, req: InstallRequirement, @@ -378,14 +482,96 @@ def _fetch_metadata_only( ) return None if self.require_hashes: + # Hash checking also means hashes are provided for all reqs, so no resolve + # is necessary and metadata-only fetching provides no speedup. logger.debug( "Metadata-only fetching is not used as hash checking is required", ) return None - # Try PEP 658 metadata first, then fall back to lazy wheel if unavailable. - return self._fetch_metadata_using_link_data_attr( - req - ) or self._fetch_metadata_using_lazy_wheel(req.link) + + if cached_dist := self._fetch_cached_metadata(req): + return cached_dist + # If we've used the lazy wheel approach, then PEP 658 metadata is not available. + # If the wheel is very large (>1GB), then retrieving it from the CacheControl + # HTTP cache may take multiple seconds, even on a fast computer, and the + # preparer will unnecessarily copy the cached response to disk before deleting + # it at the end of the run. Caching the dist metadata in LinkMetadataCache means + # later pip executions can retrieve metadata within milliseconds and avoid + # thrashing the disk. + # Even if we do employ PEP 658 metadata, we would still have to ping PyPI to + # ensure the .metadata file hasn't changed if we relied on CacheControl, even + # though PEP 658 metadata is guaranteed to be immutable. We optimize for this + # case by referring to our local cache. 
+ if cached_dist := ( + self._fetch_metadata_using_link_data_attr(req) + or self._fetch_metadata_using_lazy_wheel(req) + ): + self._cache_metadata(req, cached_dist) + return cached_dist + return None + + def _locate_metadata_cache_entry(self, link: Link) -> Path | None: + """If the metadata cache is active, generate a filesystem path from the hash of + the given Link.""" + if self._metadata_cache is None: + return None + + return self._metadata_cache.cache_path(link) + + def _fetch_cached_metadata( + self, req: InstallRequirement + ) -> BaseDistribution | None: + cached_path = self._locate_metadata_cache_entry(req.link) + if cached_path is None: + return None + + # Quietly continue if the cache entry does not exist. + if not os.path.isfile(cached_path): + logger.debug( + "no cached metadata for link %s at %s", + req.link, + cached_path, + ) + return None + + try: + with bz2.open(cached_path, mode="rt", encoding="utf-8") as f: + logger.debug( + "found cached metadata for link %s at %s", req.link, cached_path + ) + args = json.load(f) + cached_dist = CacheableDist.from_json(args) + return cached_dist.to_dist() + except Exception: + raise CacheMetadataError(req, "error reading cached metadata") + + def _cache_metadata( + self, + req: InstallRequirement, + metadata_dist: BaseDistribution, + ) -> None: + cached_path = self._locate_metadata_cache_entry(req.link) + if cached_path is None: + return + + # The cache file exists already, so we have nothing to do. + if os.path.isfile(cached_path): + logger.debug( + "metadata for link %s is already cached at %s", req.link, cached_path + ) + return + + # The metadata cache is split across several subdirectories, so ensure the + # containing directory for the cache file exists before writing. + os.makedirs(str(cached_path.parent), exist_ok=True) + try: + cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) + args = cacheable_dist.to_json() + logger.debug("caching metadata for link %s at %s", req.link, cached_path) + with bz2.open(cached_path, mode="wt", encoding="utf-8") as f: + json.dump(args, f) + except Exception: + raise CacheMetadataError(req, "failed to serialize metadata") def _fetch_metadata_using_link_data_attr( self, @@ -403,6 +589,9 @@ def _fetch_metadata_using_link_data_attr( metadata_link, ) # (2) Download the contents of the METADATA file, separate from the dist itself. + # NB: this request will hit the CacheControl HTTP cache, which will be very + # quick since the METADATA file is very small. Therefore, we can rely on + # HTTP caching instead of LinkMetadataCache. metadata_file = get_http_url( metadata_link, self._download, @@ -430,36 +619,38 @@ def _fetch_metadata_using_link_data_attr( def _fetch_metadata_using_lazy_wheel( self, - link: Link, + req: InstallRequirement, ) -> BaseDistribution | None: """Fetch metadata using lazy wheel, if possible.""" # --use-feature=fast-deps must be provided. 
if not self.use_lazy_wheel: return None - if link.is_file or not link.is_wheel: + if req.link.is_file or not req.link.is_wheel: logger.debug( "Lazy wheel is not used as %r does not point to a remote wheel", - link, + req.link, ) return None - wheel = Wheel(link.filename) + wheel = Wheel(req.link.filename) name = canonicalize_name(wheel.name) logger.info( "Obtaining dependency information from %s %s", name, wheel.version, ) - url = link.url.split("#", 1)[0] + try: - return dist_from_wheel_url(name, url, self._session) + return dist_from_wheel_url( + name, req.link.url_without_fragment, self._session + ) except HTTPRangeRequestUnsupported: - logger.debug("%s does not support range requests", url) + logger.debug("%s does not support range requests", req.link) return None def _complete_partial_requirements( self, - partially_downloaded_reqs: Iterable[InstallRequirement], + metadata_only_reqs: list[InstallRequirement], parallel_builds: bool = False, ) -> None: """Download any requirements which were only fetched by metadata.""" @@ -471,9 +662,23 @@ def _complete_partial_requirements( # `req.local_file_path` on the appropriate requirement after passing # all the links at once into BatchDownloader. links_to_fully_download: dict[Link, InstallRequirement] = {} - for req in partially_downloaded_reqs: + for req in metadata_only_reqs: assert req.link - links_to_fully_download[req.link] = req + + # (1) File URLs don't need to be downloaded, so skip them. + if req.link.scheme == "file": + continue + # (2) If this is e.g. a git url, we don't know how to handle that in the + # BatchDownloader, so leave it for self._prepare_linked_requirement() at + # the end of this method, which knows how to handle any URL. + can_simply_download = True + try: + # This will raise InvalidSchema if our Session can't download it. + self._session.get_adapter(req.link.url) + except InvalidSchema: + can_simply_download = False + if can_simply_download: + links_to_fully_download[req.link] = req batch_download = self._download.batch(links_to_fully_download.keys(), temp_dir) for link, (filepath, _) in batch_download: @@ -495,9 +700,33 @@ # This step is necessary to ensure all lazy wheels are processed # successfully by the 'download', 'wheel', and 'install' commands. - for req in partially_downloaded_reqs: + for req in metadata_only_reqs: self._prepare_linked_requirement(req, parallel_builds) + def _check_download_path(self, req: InstallRequirement) -> None: + """Check if the relevant file is already available in the download directory. + + If so, check its hash, and delete the file if the hash doesn't match.""" + if self.download_dir is None: + return + if not req.link.is_wheel: + return + + hashes = self._get_linked_req_hashes(req) + if file_path := _check_download_dir( + req.link, + self.download_dir, + hashes, + # When a locally built wheel has been found in cache, we don't warn + # about re-downloading when the already downloaded wheel hash does + # not match. This is because the hash must be checked against the + # original link, not the cached link. In that case the already + # downloaded file will be removed and re-fetched from cache (which + # implies a hash check against the cache entry's origin.json).
+ warn_on_hash_mismatch=not req.is_wheel_from_cache, + ): + self._downloaded[req.link.url] = file_path + def prepare_linked_requirement( self, req: InstallRequirement, parallel_builds: bool = False ) -> BaseDistribution: @@ -505,108 +734,109 @@ def prepare_linked_requirement( assert req.link self._log_preparing_link(req) with indent_log(): - # Check if the relevant file is already available - # in the download directory - file_path = None - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir( - req.link, - self.download_dir, - hashes, - # When a locally built wheel has been found in cache, we don't warn - # about re-downloading when the already downloaded wheel hash does - # not match. This is because the hash must be checked against the - # original link, not the cached link. It that case the already - # downloaded file will be removed and re-fetched from cache (which - # implies a hash check against the cache entry's origin.json). - warn_on_hash_mismatch=not req.is_wheel_from_cache, - ) + # See if the file is already downloaded, and check its hash if so. + self._check_download_path(req) - if file_path is not None: - # The file is already available, so mark it as downloaded - self._downloaded[req.link.url] = file_path - else: - # The file is not available, attempt to fetch only metadata - metadata_dist = self._fetch_metadata_only(req) - if metadata_dist is not None: - req.needs_more_preparation = True - return metadata_dist + # First try to fetch only metadata. + if metadata_dist := self._fetch_metadata_only(req): + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) + return metadata_dist # None of the optimizations worked, fully prepare the requirement return self._prepare_linked_requirement(req, parallel_builds) - def prepare_linked_requirements_more( - self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False - ) -> None: - """Prepare linked requirements more, if needed.""" - reqs = [req for req in reqs if req.needs_more_preparation] + def _extract_download_info(self, reqs: Iterable[InstallRequirement]) -> None: + """ + `pip install --report` extracts the download info from each requirement for its + JSON output, so we need to make sure every requirement has this before finishing + the resolve. But by the time this method is called, .download_info has only been + populated for requirements already found in the wheel cache, so we need to + synthesize it for uncached results. Luckily, a DirectUrl can be parsed directly + from a URL without any other context. However, this also means the download info + will only contain a hash if the link itself declares one. + """ for req in reqs: - # Determine if any of these requirements were already downloaded. - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir(req.link, self.download_dir, hashes) - if file_path is not None: - self._downloaded[req.link.url] = file_path - req.needs_more_preparation = False - - # Prepare requirements we found were already downloaded for some - # reason. The other downloads will be completed separately.
- partially_downloaded_reqs: list[InstallRequirement] = [] + if req.download_info is None: + self._ensure_download_info(req) + + def _force_fully_prepared( + self, reqs: Iterable[InstallRequirement], assert_has_dist_files: bool + ) -> None: + """ + The legacy resolver prepares requirements through code paths that can leave + them only half-done. Until that behavior is untangled, forcibly mark every + requirement as prepared here so downstream code sees a consistent state. + """ for req in reqs: - if req.needs_more_preparation: - partially_downloaded_reqs.append(req) - else: - self._prepare_linked_requirement(req, parallel_builds) + req.prepared = True + if assert_has_dist_files: + assert req.is_concrete - # TODO: separate this part out from RequirementPreparer when the v1 - # resolver can be removed! + def _ensure_dist_files( + self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False + ) -> None: + """Download any metadata-only linked requirements.""" + metadata_only_reqs = [req for req in reqs if not req.is_concrete] self._complete_partial_requirements( - partially_downloaded_reqs, + metadata_only_reqs, parallel_builds=parallel_builds, ) - def _prepare_linked_requirement( - self, req: InstallRequirement, parallel_builds: bool - ) -> BaseDistribution: - assert req.link - link = req.link - - hashes = self._get_linked_req_hashes(req) + def finalize_linked_requirements( + self, + reqs: Iterable[InstallRequirement], + require_dist_files: bool, + parallel_builds: bool = False, + ) -> None: + """Finish preparing linked requirements, downloading dist files if needed. + + Neighboring .metadata files as per PEP 658 or lazy wheels via fast-deps will be + preferred to extract metadata from any linked requirement (one that has been + mapped to a Link) without downloading the underlying wheel or sdist. When ``pip + install --dry-run`` is called, we want to avoid ever downloading the underlying + dist, but we still need to provide all of the results that pip commands expect + from the typical resolve process. + + Those expectations vary, but one distinction lies in whether the command needs + an actual physical dist somewhere on the filesystem, or just the metadata about + it from the resolver (as in ``pip install --report``). If the command requires + actual physical filesystem locations for the resolved dists, it must call this + method with ``require_dist_files=True`` to fully download anything + that remains. + """ + if require_dist_files: + self._ensure_dist_files(reqs, parallel_builds=parallel_builds) + else: + self._extract_download_info(reqs) + self._force_fully_prepared(reqs, assert_has_dist_files=require_dist_files) - if hashes and req.is_wheel_from_cache: - assert req.download_info is not None - assert link.is_wheel - assert link.is_file - # We need to verify hashes, and we have found the requirement in the cache - # of locally built wheels. - if ( - isinstance(req.download_info.info, ArchiveInfo) - and req.download_info.info.hashes - and hashes.has_one_of(req.download_info.info.hashes) - ): - # At this point we know the requirement was built from a hashable source - # artifact, and we verified that the cache entry's hash of the original - # artifact matches one of the hashes we expect. We don't verify hashes - # against the cached wheel, because the wheel is not the original. - hashes = None - else: - logger.warning( - "The hashes of the source archive found in cache entry " - "don't match, ignoring cached built wheel " - "and re-downloading source."
- ) - req.link = req.cached_wheel_source_link - link = req.link + def _ensure_local_file_path( + self, req: InstallRequirement, hashes: Hashes | None + ) -> None: + """Ensure that ``req.link`` is downloaded locally, matches the expected hash, + and that ``req.local_file_path`` is set to the download location.""" + if req.link.is_existing_dir(): + return - self._ensure_link_req_src_dir(req, parallel_builds) + # NB: req.local_file_path may be set already, if it was: + # (1) sourced from a local file (such as a local .tar.gz path), + # (2) also in the wheel cache (e.g. built from an sdist). + # We will overwrite it if so, since the local file path will still point to the + # .tar.gz source instead of the wheel cache entry. - if link.is_existing_dir(): - local_file = None - elif link.url not in self._downloaded: + local_file: File | None = None + # The file may have already been downloaded in batch if it was + # a metadata-only requirement, or if it was already in the download directory. + if file_path := self._downloaded.get(req.link.url, None): + if hashes: + hashes.check_against_path(file_path) + local_file = File(file_path, content_type=None) + else: try: local_file = unpack_url( - link, + req.link, req.source_dir, self._download, self.verbosity, @@ -616,51 +846,82 @@ def _prepare_linked_requirement( except NetworkConnectionError as exc: raise InstallationError( f"Could not install requirement {req} because of HTTP " - f"error {exc} for URL {link}" - ) - else: - file_path = self._downloaded[link.url] - if hashes: - hashes.check_against_path(file_path) - local_file = File(file_path, content_type=None) + f"error {exc} for URL {req.link}" + ) from exc - # If download_info is set, we got it from the wheel cache. - if req.download_info is None: - # Editables don't go through this function (see - # prepare_editable_requirement). - assert not req.editable - req.download_info = direct_url_from_link(link, req.source_dir) - # Make sure we have a hash in download_info. If we got it as part of the - # URL, it will have been verified and we can rely on it. Otherwise we - # compute it from the downloaded file. - # FIXME: https://github.com/pypa/pip/issues/11943 - if ( - isinstance(req.download_info.info, ArchiveInfo) - and not req.download_info.info.hashes - and local_file - ): - hash = hash_file(local_file.path)[0].hexdigest() - # We populate info.hash for backward compatibility. - # This will automatically populate info.hashes. - req.download_info.info.hash = f"sha256={hash}" - - # For use in later processing, - # preserve the file path on the requirement. 
- if local_file: + if local_file is not None: req.local_file_path = local_file.path - dist = _get_prepared_distribution( + def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistribution: + (builds_metadata, dist) = _get_prepared_distribution( req, self.build_tracker, self.build_env_installer, self.build_isolation, self.check_build_deps, ) + assert dist.is_concrete + assert req.is_concrete + assert req.get_dist() is dist + + if builds_metadata and should_cache(req): + self._cache_metadata(req, dist) + return dist + def _prepare_linked_requirement( + self, req: InstallRequirement, parallel_builds: bool + ) -> BaseDistribution: + """Ensure the dist pointing to the fully-resolved requirement is downloaded and + installable.""" + assert req.link, "this requirement must have a download link to fully prepare" + + hashes: Hashes | None = self._get_linked_req_hashes(req) + + if hashes and req.is_wheel_from_cache: + hashes = self._rewrite_link_and_hashes_for_cached_wheel(req, hashes) + + # req.source_dir is only set for editable requirements. We don't need to unpack + # wheels, so no need for a source directory. + if not req.link.is_wheel: + self._ensure_link_req_src_dir(req, parallel_builds) + + # Ensure the dist is downloaded, check its hash, and unpack it into the source + # directory (if applicable). + self._ensure_local_file_path(req, hashes) + + # Set req.download_info for --report output. + if req.download_info is None: + # If download_info is set, we got it from the wheel cache. + self._ensure_download_info(req) + + # Build (if necessary) and prepare the distribution for installation. + return self._prepare_and_finalize_dist(req) + + def _ensure_download_info(self, req: InstallRequirement) -> None: + """Ensure that ``req.download_info`` is set, for uses such as --report.""" + assert req.download_info is None + # Editables don't go through this function (see prepare_editable_requirement). + assert not req.editable + req.download_info = direct_url_from_link(req.link, req.source_dir) + # Make sure we have a hash in download_info. If we got it as part of the + # URL, it will have been verified and we can rely on it. Otherwise we + # compute it from the downloaded file. + # FIXME: https://github.com/pypa/pip/issues/11943 + if ( + isinstance(req.download_info.info, ArchiveInfo) + and not req.download_info.info.hashes + and req.local_file_path + ): + hash = hash_file(req.local_file_path)[0].hexdigest() + # We populate info.hash for backward compatibility. + # This will automatically populate info.hashes. + req.download_info.info.hash = f"sha256={hash}" + def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None + assert req.is_concrete link = req.link if link.is_vcs or (link.is_existing_dir() and req.editable): # Make a .zip of the source_dir we already created. @@ -704,18 +965,8 @@ def prepare_editable_requirement( req.update_editable() assert req.source_dir req.download_info = direct_url_for_editable(req.unpacked_source_directory) - - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.build_env_installer, - self.build_isolation, - self.check_build_deps, - ) - req.check_if_exists(self.use_user_site) - - return dist + return self._prepare_and_finalize_dist(req) def prepare_installed_requirement( self, @@ -739,4 +990,4 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." 
) - return InstalledDistribution(req).get_metadata_distribution() + return self._prepare_and_finalize_dist(req) diff --git a/src/pip/_internal/req/constructors.py b/src/pip/_internal/req/constructors.py index 056e7e3a7f1..121b75f0df2 100644 --- a/src/pip/_internal/req/constructors.py +++ b/src/pip/_internal/req/constructors.py @@ -554,7 +554,7 @@ def install_req_extend_extras( """ result = copy.copy(ireq) result.extras = {*ireq.extras, *extras} - result.req = ( + result._req = ( _set_requirement_extras(ireq.req, result.extras) if ireq.req is not None else None diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index c9f6bff17e8..378fd63f33b 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -10,7 +10,7 @@ from collections.abc import Collection, Iterable, Sequence from optparse import Values from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING from pip._vendor.packaging.markers import Marker from pip._vendor.packaging.requirements import Requirement @@ -26,10 +26,7 @@ from pip._internal.metadata import ( BaseDistribution, get_default_environment, - get_directory_distribution, - get_wheel_distribution, ) -from pip._internal.metadata.base import FilesystemWheel from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.operations.build.metadata import generate_metadata @@ -62,6 +59,9 @@ from pip._internal.utils.virtualenv import running_under_virtualenv from pip._internal.vcs import vcs +if TYPE_CHECKING: + import email.message + logger = logging.getLogger(__name__) @@ -91,7 +91,7 @@ def __init__( permit_editable_wheels: bool = False, ) -> None: assert req is None or isinstance(req, Requirement), req - self.req = req + self._req = req self.comes_from = comes_from self.constraint = constraint self.editable = editable @@ -153,6 +153,7 @@ def __init__( self.hash_options = hash_options if hash_options else {} self.config_settings = config_settings # Set to True after successful preparation of this requirement + # TODO: this is only used in the legacy resolver: remove this! self.prepared = False # User supplied requirement are explicitly requested for installation # by the user via CLI arguments or requirements files, as opposed to, @@ -194,12 +195,32 @@ def __init__( ) self.use_pep517 = True - # This requirement needs more preparation before it can be built - self.needs_more_preparation = False + # When a dist is computed for this requirement, cache it here so it's visible + # everywhere within pip and isn't computed more than once. This may be + # a "virtual" dist without a physical location on the filesystem, or + # a "concrete" dist which has been fully downloaded. + self._dist: BaseDistribution | None = None # This requirement needs to be unpacked before it can be installed. self._archive_source: Path | None = None + @property + def req(self) -> Requirement | None: + """Calculate a requirement from the cached dist, if populated. 
+ + The cached dist can be populated by either + ``self.cache_virtual_metadata_only_dist()`` or + ``self.cache_concrete_dist()`` and can also be retrieved with + ``self.get_dist()``.""" + if self._req is not None: + return self._req + if self._dist is not None: + name = self._dist.canonical_name + version = str(self._dist.version) + self._req = Requirement(f"{name}=={version}") + return self._req + return None + def __str__(self) -> str: if self.req: s = redact_auth_from_requirement(self.req) @@ -389,7 +410,7 @@ def ensure_build_location( def _set_requirement(self) -> None: """Set requirement after generating metadata.""" - assert self.req is None + assert self._req is None assert self.metadata is not None assert self.source_dir is not None @@ -399,7 +420,7 @@ def _set_requirement(self) -> None: else: op = "===" - self.req = get_requirement( + self._req = get_requirement( "".join( [ self.metadata["Name"], @@ -425,7 +446,7 @@ def warn_on_mismatching_name(self) -> None: metadata_name, self.name, ) - self.req = get_requirement(metadata_name) + self._req = get_requirement(metadata_name) def check_if_exists(self, use_user_site: bool) -> None: """Find an installed distribution that satisfies or conflicts @@ -553,11 +574,11 @@ def isolated_editable_sanity_check(self) -> None: f"Consider using a build backend that supports PEP 660." ) - def prepare_metadata(self) -> None: + def prepare_metadata_directory(self) -> None: """Ensure that project metadata is available. - Under PEP 517 and PEP 660, call the backend hook to prepare the metadata. - Under legacy processing, call setup.py egg-info. + Under PEP 517 and PEP 660, call the backend hook to prepare the metadata + directory. Under legacy processing, call setup.py egg-info. """ assert self.source_dir, f"No source dir for {self}" details = self.name or f"from {self.link}" @@ -589,6 +610,8 @@ def prepare_metadata(self) -> None: details=details, ) + def validate_sdist_metadata(self) -> None: + """Ensure that we have a dist, and ensure it corresponds to expectations.""" # Act on the newly generated metadata, based on the name and version. if not self.name: self._set_requirement() @@ -598,25 +621,51 @@ def prepare_metadata(self) -> None: self.assert_source_matches_version() @property - def metadata(self) -> Any: - if not hasattr(self, "_metadata"): - self._metadata = self.get_dist().metadata - - return self._metadata + def metadata(self) -> email.message.Message: + return self.get_dist().metadata def get_dist(self) -> BaseDistribution: - if self.metadata_directory: - return get_directory_distribution(self.metadata_directory) - elif self.local_file_path and self.is_wheel: - assert self.req is not None - return get_wheel_distribution( - FilesystemWheel(self.local_file_path), - canonicalize_name(self.req.name), - ) - raise AssertionError( - f"InstallRequirement {self} has no metadata directory and no wheel: " - f"can't make a distribution." - ) + """Retrieve the dist resolved from this requirement. + + :raises AssertionError: if the resolver has not yet been executed. + """ + if self._dist is None: + raise AssertionError(f"{self!r} has no dist associated.") + return self._dist + + def cache_virtual_metadata_only_dist(self, dist: BaseDistribution) -> None: + """Associate a "virtual" metadata-only dist to this requirement. + + This dist cannot be installed, but it can be used to complete the resolve + process. + + :raises AssertionError: if a dist has already been associated. + :raises AssertionError: if the provided dist is "concrete", i.e. 
exists + somewhere on the filesystem. + """ + assert self._dist is None, self + assert not dist.is_concrete, dist + self._dist = dist + + def cache_concrete_dist(self, dist: BaseDistribution) -> None: + """Associate a "concrete" dist to this requirement. + + A concrete dist exists somewhere on the filesystem and can be installed. + + :raises AssertionError: if a concrete dist has already been associated. + :raises AssertionError: if the provided dist is not concrete. + """ + if self._dist is not None: + # If we set a dist twice for the same requirement, we must be hydrating + # a concrete dist for what was previously virtual. This will occur in the + # case of `install --dry-run` when PEP 658 metadata is available. + assert not self._dist.is_concrete + assert dist.is_concrete + self._dist = dist + + @property + def is_concrete(self) -> bool: + return self._dist is not None and self._dist.is_concrete def assert_source_matches_version(self) -> None: assert self.source_dir, f"No source dir for {self}" diff --git a/src/pip/_internal/req/req_set.py b/src/pip/_internal/req/req_set.py index 3451b24f27b..8db9b89d85f 100644 --- a/src/pip/_internal/req/req_set.py +++ b/src/pip/_internal/req/req_set.py @@ -42,7 +42,7 @@ def add_unnamed_requirement(self, install_req: InstallRequirement) -> None: self.unnamed_requirements.append(install_req) def add_named_requirement(self, install_req: InstallRequirement) -> None: - assert install_req.name + assert install_req.name, install_req project_name = canonicalize_name(install_req.name) self.requirements[project_name] = install_req diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py index 1ba70c2b39e..c4fd4e28f93 100644 --- a/src/pip/_internal/resolution/resolvelib/resolver.py +++ b/src/pip/_internal/resolution/resolvelib/resolver.py @@ -180,11 +180,6 @@ def resolve( req_set.add_named_requirement(ireq) - reqs = req_set.all_requirements - self.factory.preparer.prepare_linked_requirements_more(reqs) - for req in reqs: - req.prepared = True - req.needs_more_preparation = False return req_set def get_installation_order( diff --git a/src/pip/_internal/wheel_builder.py b/src/pip/_internal/wheel_builder.py index beed02f00c5..5ced2679e33 100644 --- a/src/pip/_internal/wheel_builder.py +++ b/src/pip/_internal/wheel_builder.py @@ -4,7 +4,6 @@ import logging import os.path -import re import shutil from collections.abc import Iterable @@ -26,23 +25,12 @@ from pip._internal.utils.subprocess import call_subprocess from pip._internal.utils.temp_dir import TempDirectory from pip._internal.utils.urls import path_to_url -from pip._internal.vcs import vcs logger = logging.getLogger(__name__) -_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) - BuildResult = tuple[list[InstallRequirement], list[InstallRequirement]] -def _contains_egg_info(s: str) -> bool: - """Determine whether the string looks like an egg_info. - - :param s: The string to parse. E.g. foo-2.1 - """ - return bool(_egg_info_re.search(s)) - - def _should_build( req: InstallRequirement, ) -> bool: @@ -67,54 +55,6 @@ def should_build_for_install_command( return _should_build(req) -def _should_cache( - req: InstallRequirement, -) -> bool | None: - """ - Return whether a built InstallRequirement can be stored in the persistent - wheel cache, assuming the wheel cache is available, and _should_build() - has determined a wheel needs to be built. 
- """ - if req.editable or not req.source_dir: - # never cache editable requirements - return False - - if req.link and req.link.is_vcs: - # VCS checkout. Do not cache - # unless it points to an immutable commit hash. - assert not req.editable - assert req.source_dir - vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) - assert vcs_backend - if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): - return True - return False - - assert req.link - base, ext = req.link.splitext() - if _contains_egg_info(base): - return True - - # Otherwise, do not cache. - return False - - -def _get_cache_dir( - req: InstallRequirement, - wheel_cache: WheelCache, -) -> str: - """Return the persistent or temporary cache directory where the built - wheel need to be stored. - """ - cache_available = bool(wheel_cache.cache_dir) - assert req.link - if cache_available and _should_cache(req): - cache_dir = wheel_cache.get_path_for_link(req.link) - else: - cache_dir = wheel_cache.get_ephem_path_for_link(req.link) - return cache_dir - - def _verify_one(req: InstallRequirement, wheel_path: str) -> None: canonical_name = canonicalize_name(req.name or "") w = Wheel(os.path.basename(wheel_path)) @@ -295,7 +235,7 @@ def build( build_successes, build_failures = [], [] for req in requirements: assert req.name - cache_dir = _get_cache_dir(req, wheel_cache) + cache_dir = wheel_cache.resolve_cache_dir(req) wheel_file = _build_one( req, cache_dir, diff --git a/tests/conftest.py b/tests/conftest.py index c98b871170f..80d5ba8bb3f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -753,6 +753,9 @@ class FakePackage: requires_dist: tuple[str, ...] = () # This will override the Name specified in the actual dist's METADATA. metadata_name: str | None = None + # Whether to delete the file this points to, which causes any attempt to fetch this + # package to fail unless it is processed as a metadata-only dist. + delete_linked_file: bool = False def metadata_filename(self) -> str: """This is specified by PEP 658.""" @@ -842,6 +845,27 @@ def fake_packages() -> dict[str, list[FakePackage]]: ("simple==1.0",), ), ], + "complex-dist": [ + FakePackage( + "complex-dist", + "0.1", + "complex_dist-0.1-py2.py3-none-any.whl", + MetadataKind.Unhashed, + # Validate that the wheel isn't fetched if metadata is available and + # --dry-run is on, when the metadata presents no hash itself. + delete_linked_file=True, + ), + ], + "corruptwheel": [ + FakePackage( + "corruptwheel", + "1.0", + "corruptwheel-1.0-py2.py3-none-any.whl", + # Validate that the wheel isn't fetched if metadata is available and + # --dry-run is on, when the metadata *does* present a hash. + MetadataKind.Sha256, + ), + ], "has-script": [ # Ensure we check PEP 658 metadata hashing errors for wheel files. FakePackage( @@ -927,10 +951,10 @@ def html_index_for_packages( f' {package_link.filename}
' # noqa: E501 ) # (3.2) Copy over the corresponding file in `shared_data.packages`. - shutil.copy( - shared_data.packages / package_link.filename, - pkg_subdir / package_link.filename, - ) + cached_file = shared_data.packages / package_link.filename + new_file = pkg_subdir / package_link.filename + if not package_link.delete_linked_file: + shutil.copy(cached_file, new_file) # (3.3) Write a metadata file, if applicable. if package_link.metadata != MetadataKind.NoFile: with open(pkg_subdir / package_link.metadata_filename(), "wb") as f: @@ -985,7 +1009,8 @@ def html_index_with_onetime_server( """Serve files from a generated pypi index, erroring if a file is downloaded more than once. - Provide `-i http://localhost:8000` to pip invocations to point them at this server. + Provide `-i http://localhost:<port>` (using the dynamically assigned port) to + pip invocations to point them at this server. """ class InDirectoryServer(http.server.ThreadingHTTPServer): @@ -1000,7 +1025,7 @@ def finish_request(self: Self, request: Any, client_address: Any) -> None: class Handler(OneTimeDownloadHandler): _seen_paths: ClassVar[set[str]] = set() - with InDirectoryServer(("", 8000), Handler) as httpd: + with InDirectoryServer(("", 0), Handler) as httpd: server_thread = threading.Thread(target=httpd.serve_forever) server_thread.start() diff --git a/tests/functional/test_check.py b/tests/functional/test_check.py index 06ed1b08eac..f3c309b0f45 100644 --- a/tests/functional/test_check.py +++ b/tests/functional/test_check.py @@ -123,10 +123,7 @@ def test_check_complicated_name_missing(script: PipTestEnvironment) -> None: # Without dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) @@ -149,10 +146,7 @@ def test_check_complicated_name_broken(script: PipTestEnvironment) -> None: # With broken dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -185,10 +179,7 @@ def test_check_complicated_name_clean(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -216,10 +207,7 @@ def test_check_considers_conditional_reqs(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) diff --git a/tests/functional/test_download.py b/tests/functional/test_download.py index
c5887aa1bf0..555822a903e 100644 --- a/tests/functional/test_download.py +++ b/tests/functional/test_download.py @@ -1276,6 +1276,7 @@ def download_server_html_index( ) -> Callable[..., tuple[TestPipResult, Path]]: """Execute `pip download` against a generated PyPI index.""" download_dir = tmpdir / "download_dir" + server_port = html_index_with_onetime_server.server_address[1] def run_for_generated_index( args: list[str], @@ -1290,7 +1291,7 @@ def run_for_generated_index( "-d", str(download_dir), "-i", - "http://localhost:8000", + f"http://localhost:{server_port}", *args, ] result = script.pip(*pip_args, allow_error=allow_error) diff --git a/tests/functional/test_install.py b/tests/functional/test_install.py index a1bd81d31d0..5092c28a218 100644 --- a/tests/functional/test_install.py +++ b/tests/functional/test_install.py @@ -2128,7 +2128,7 @@ def test_install_conflict_results_in_warning( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency result2 = script.pip( @@ -2138,7 +2138,7 @@ def test_install_conflict_results_in_warning( allow_stderr_error=True, ) assert "pkga 1.0 requires pkgb==1.0" in result2.stderr, str(result2) - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_install_conflict_warning_can_be_suppressed( @@ -2158,11 +2158,11 @@ def test_install_conflict_warning_can_be_suppressed( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency; suppressing warning result2 = script.pip("install", "--no-index", pkgB_path, "--no-warn-conflicts") - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_target_install_ignores_distutils_config_install_prefix( diff --git a/tests/functional/test_install_check.py b/tests/functional/test_install_check.py index a0598fe2703..7f40a711d73 100644 --- a/tests/functional/test_install_check.py +++ b/tests/functional/test_install_check.py @@ -28,7 +28,7 @@ def test_check_install_canonicalization(script: PipTestEnvironment) -> None: # Let's install pkgA without its dependency result = script.pip("install", "--no-index", pkga_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result.stdout, str(result) + assert "Successfully installed pkga-1.0" in result.stdout, str(result) # Install the first missing dependency. Only an error for the # second dependency should remain. 
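The new tests below exercise the two-phase preparation flow introduced in src/pip/_internal/operations/prepare.py above. As a rough sketch (not taken from the patch itself; `preparer` is a RequirementPreparer, `reqs` the resolved requirements, and `dry_run` a hypothetical flag):

    # Phase 1 (during the resolve): each linked requirement gets a dist, which
    # stays "virtual" (metadata-only, nothing downloaded) whenever a PEP 658
    # .metadata file or a lazy wheel can supply the metadata.
    for req in reqs:
        dist = preparer.prepare_linked_requirement(req)

    # Phase 2 (after the resolve): commands that need real files on disk pass
    # require_dist_files=True to download whatever is still metadata-only;
    # `install --dry-run` passes False and only fills in req.download_info.
    preparer.finalize_linked_requirements(reqs, require_dist_files=not dry_run)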
diff --git a/tests/functional/test_install_metadata.py b/tests/functional/test_install_metadata.py new file mode 100644 index 00000000000..8a5e28b0b1b --- /dev/null +++ b/tests/functional/test_install_metadata.py @@ -0,0 +1,242 @@ +import json +import re +from collections.abc import Iterator +from pathlib import Path +from typing import Any, Callable + +import pytest + +from pip._vendor.packaging.requirements import Requirement + +from pip._internal.models.direct_url import DirectUrl +from pip._internal.utils.urls import path_to_url + +from tests.lib import ( + PipTestEnvironment, + TestPipResult, +) + + +@pytest.fixture +def install_with_generated_html_index( + script: PipTestEnvironment, + html_index_for_packages: Path, + tmpdir: Path, +) -> Callable[..., tuple[TestPipResult, dict[str, Any]]]: + """Execute `pip install --report` against a generated PyPI index.""" + output_file = tmpdir / "output_file.json" + + def run_for_generated_index( + args: list[str], + *, + dry_run: bool = True, + allow_error: bool = False, + ) -> tuple[TestPipResult, dict[str, Any]]: + """ + Produce a PyPI directory structure pointing to the specified packages, then + execute `pip install --report ... -i ...` pointing to our generated index. + """ + pip_args = [ + "install", + *(("--dry-run",) if dry_run else ()), + "--ignore-installed", + "--report", + str(output_file), + "-i", + path_to_url(str(html_index_for_packages)), + *args, + ] + result = script.pip(*pip_args, allow_error=allow_error) + try: + with open(output_file, "rb") as f: + report = json.load(f) + except FileNotFoundError: + if allow_error: + report = {} + else: + raise + return (result, report) + + return run_for_generated_index + + +def iter_dists(report: dict[str, Any]) -> Iterator[tuple[Requirement, DirectUrl]]: + """Parse a (req,url) tuple from each installed dist in the --report json.""" + for inst in report["install"]: + metadata = inst["metadata"] + name = metadata["name"] + version = metadata["version"] + req = Requirement(f"{name}=={version}") + direct_url = DirectUrl.from_dict(inst["download_info"]) + yield (req, direct_url) + + +@pytest.mark.parametrize( + "requirement_to_install, expected_outputs", + [ + ("simple2==1.0", ["simple2==1.0", "simple==1.0"]), + ("simple==2.0", ["simple==2.0"]), + ( + "colander", + ["colander==0.9.9", "translationstring==1.1"], + ), + ( + "compilewheel", + ["compilewheel==1.0", "simple==1.0"], + ), + ], +) +def test_install_with_metadata( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement_to_install: str, + expected_outputs: list[str], +) -> None: + """Verify that if a data-dist-info-metadata attribute is present, then it is used + instead of the actual dist's METADATA.""" + _, report = install_with_generated_html_index( + [requirement_to_install], + ) + installed = sorted(str(r) for r, _ in iter_dists(report)) + assert installed == expected_outputs + + +@pytest.mark.parametrize( + "requirement_to_install, real_hash", + [ + ( + "simple==3.0", + "95e0f200b6302989bcf2cead9465cf229168295ea330ca30d1ffeab5c0fed996", + ), + ( + "has-script", + "16ba92d7f6f992f6de5ecb7d58c914675cf21f57f8e674fb29dcb4f4c9507e5b", + ), + ], +) +def test_incorrect_metadata_hash( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement_to_install: str, + real_hash: str, +) -> None: + """Verify that if a hash for data-dist-info-metadata is provided, it must match the + actual hash of the metadata file.""" + result, _ =
install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_msg = f"""\ + Expected sha256 wrong-hash + Got {real_hash}""" + assert expected_msg in result.stderr + + +@pytest.mark.parametrize( + "requirement_to_install, expected_url", + [ + ("simple2==2.0", "simple2-2.0.tar.gz.metadata"), + ("priority", "priority-1.0-py2.py3-none-any.whl.metadata"), + ], +) +def test_metadata_not_found( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement_to_install: str, + expected_url: str, +) -> None: + """Verify that if a data-dist-info-metadata attribute is provided, that pip will + fetch the .metadata file at the location specified by PEP 658, and error + if unavailable.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_re = re.escape(expected_url) + pattern = re.compile( + f"ERROR: 404 Client Error: FileNotFoundError for url:.*{expected_re}" + ) + assert pattern.search(result.stderr), (pattern, result.stderr) + + +def test_produces_error_for_mismatched_package_name_in_metadata( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], +) -> None: + """Verify that the package name from the metadata matches the requested package.""" + result, _ = install_with_generated_html_index( + ["simple2==3.0"], + allow_error=True, + ) + assert result.returncode != 0 + assert ( + "simple2-3.0.tar.gz has inconsistent Name: expected 'simple2', but metadata " + "has 'not-simple2'" + ) in result.stdout + + +@pytest.mark.parametrize( + "requirement", + [ + "requires-simple-extra==0.1", + "REQUIRES_SIMPLE-EXTRA==0.1", + "REQUIRES....simple-_-EXTRA==0.1", + ], +) +def test_canonicalizes_package_name_before_verifying_metadata( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement: str, +) -> None: + """Verify that the package name from the command line and the package's + METADATA are both canonicalized before comparison, while the name from the METADATA + is always used verbatim to represent the installed candidate in --report. + + Regression test for https://github.com/pypa/pip/issues/12038 + """ + _, report = install_with_generated_html_index( + [requirement], + ) + reqs = [str(r) for r, _ in iter_dists(report)] + assert reqs == ["Requires_Simple.Extra==0.1"] + + +@pytest.mark.parametrize( + "requirement,err_string", + [ + # It's important that we verify pip won't even attempt to fetch the file, so we + # construct an input that will cause it to error if it tries at all. 
+ ( + "complex-dist==0.1", + "Could not install packages due to an OSError: [Errno 2] No such file or directory", # noqa: E501 + ), + ("corruptwheel==1.0", ".whl is invalid."), + ], +) +def test_dry_run_avoids_downloading_metadata_only_dists( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement: str, + err_string: str, +) -> None: + """Verify that the underlying dist files are not downloaded at all when + `install --dry-run` is used to resolve dists with PEP 658 metadata.""" + _, report = install_with_generated_html_index( + [requirement], + ) + assert [requirement] == [str(r) for r, _ in iter_dists(report)] + result, _ = install_with_generated_html_index( + [requirement], + dry_run=False, + allow_error=True, + ) + assert result.returncode != 0 + assert err_string in result.stderr diff --git a/tests/functional/test_install_reqs.py b/tests/functional/test_install_reqs.py index 3c5b6db4a68..66497c9eb70 100644 --- a/tests/functional/test_install_reqs.py +++ b/tests/functional/test_install_reqs.py @@ -728,7 +728,7 @@ def test_install_distribution_full_union( result = script.pip_install_local( to_install, f"{to_install}[bar]", f"{to_install}[baz]" ) - assert "Building wheel for LocalExtras" in result.stdout + assert "Building wheel for localextras" in result.stdout result.did_create(script.site_packages / "simple") result.did_create(script.site_packages / "singlemodule.py") diff --git a/tests/functional/test_wheel.py b/tests/functional/test_wheel.py index e1ede880496..3496522c4b0 100644 --- a/tests/functional/test_wheel.py +++ b/tests/functional/test_wheel.py @@ -322,7 +322,7 @@ def test_wheel_package_with_latin1_setup( pkg_to_wheel = data.packages.joinpath("SetupPyLatin1") result = script.pip("wheel", pkg_to_wheel) - assert "Successfully built SetupPyUTF8" in result.stdout + assert "Successfully built setuppyutf8" in result.stdout def test_pip_wheel_with_pep518_build_reqs( diff --git a/tests/unit/metadata/test_metadata_pkg_resources.py b/tests/unit/metadata/test_metadata_pkg_resources.py index ccb0b7dcf0f..64433803893 100644 --- a/tests/unit/metadata/test_metadata_pkg_resources.py +++ b/tests/unit/metadata/test_metadata_pkg_resources.py @@ -105,6 +105,7 @@ def test_wheel_metadata_works() -> None: metadata=InMemoryMetadata({"METADATA": metadata.as_bytes()}, ""), project_name=name, ), + concrete=False, ) assert name == dist.canonical_name == dist.raw_name diff --git a/tests/unit/test_cache.py b/tests/unit/test_cache.py index 30cdb6ebece..d7dc91bfcbb 100644 --- a/tests/unit/test_cache.py +++ b/tests/unit/test_cache.py @@ -1,13 +1,33 @@ import os from pathlib import Path +import pytest + from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version -from pip._internal.cache import WheelCache, _hash_dict +from pip._internal.cache import WheelCache, _contains_egg_info, _hash_dict from pip._internal.models.link import Link from pip._internal.utils.misc import ensure_dir +@pytest.mark.parametrize( + "s, expected", + [ + # Trivial. + ("pip-18.0", True), + # Ambiguous. + ("foo-2-2", True), + ("im-valid", True), + # Invalid. 
+ ("invalid", False), + ("im_invalid", False), + ], +) +def test_contains_egg_info(s: str, expected: bool) -> None: + result = _contains_egg_info(s) + assert result == expected + + def test_falsey_path_none() -> None: wc = WheelCache("") assert wc.cache_dir is None diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index fa688f8e42f..56446bb7aa9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -98,7 +98,7 @@ def test_get_simple_response_archive_to_http_scheme( session.assert_has_calls( [ - mock.call.head(url, allow_redirects=True), + mock.call.head(url, allow_redirects=True, headers=None), ] ) mock_raise_for_status.assert_called_once_with(session.head.return_value) @@ -160,7 +160,7 @@ def test_get_simple_response_archive_to_http_scheme_is_html( assert resp is not None assert session.mock_calls == [ - mock.call.head(url, allow_redirects=True), + mock.call.head(url, allow_redirects=True, headers=None), mock.call.get( url, headers={ @@ -248,7 +248,7 @@ def test_get_simple_response_dont_log_clear_text_password( assert resp is not None mock_raise_for_status.assert_called_once_with(resp) - assert len(caplog.records) == 2 + assert len(caplog.records) == 3 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ @@ -256,6 +256,9 @@ def test_get_simple_response_dont_log_clear_text_password( ] record = caplog.records[1] assert record.levelname == "DEBUG" + assert record.message.splitlines() == ["headers: None"] + record = caplog.records[2] + assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Fetched page https://user:****@example.com/simple/ as text/html", ] @@ -849,7 +852,7 @@ def test_get_index_content_directory_append_index(tmpdir: Path) -> None: mock_func.return_value = fake_response actual = _get_index_content(Link(dir_url), session=session) assert mock_func.mock_calls == [ - mock.call(expected_url, session=session), + mock.call(expected_url, headers=None, session=session), ], f"actual calls: {mock_func.mock_calls}" assert actual is not None @@ -965,6 +968,7 @@ def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None: # _get_simple_response(). mock_get_simple_response.assert_called_once_with( url, + headers=None, session=link_collector.session, ) diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py index a2c4cf243ca..f3a79351221 100644 --- a/tests/unit/test_req.py +++ b/tests/unit/test_req.py @@ -28,6 +28,7 @@ PreviousBuildDirError, ) from pip._internal.index.package_finder import PackageFinder +from pip._internal.metadata import get_metadata_distribution from pip._internal.models.direct_url import ArchiveInfo, DirectUrl, DirInfo, VcsInfo from pip._internal.models.link import Link from pip._internal.network.session import PipSession @@ -154,7 +155,11 @@ def test_no_reuse_existing_build_dir(self, data: TestData) -> None: ): resolver.resolve(reqset.all_requirements, True) - def test_environment_marker_extras(self, data: TestData) -> None: + def test_environment_marker_extras( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """ Test that the environment marker extras are used with non-wheel installs. 
@@ -164,6 +169,13 @@ def test_environment_marker_extras(self, data: TestData) -> None: os.fspath(data.packages.joinpath("LocalEnvironMarker")), ) req.user_supplied = True + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + req, "cache_concrete_dist", partial(cache_concrete_dist, req) + ) reqset.add_unnamed_requirement(req) finder = make_test_finder(find_links=[data.find_links]) with self._basic_resolver(finder) as resolver: @@ -505,12 +517,23 @@ def test_download_info_local_dir(self, data: TestData) -> None: assert req.download_info.url.startswith("file://") assert isinstance(req.download_info.info, DirInfo) - def test_download_info_local_editable_dir(self, data: TestData) -> None: + def test_download_info_local_editable_dir( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """Test that download_info is set for requirements from a local editable dir.""" finder = make_test_finder() with self._basic_resolver(finder) as resolver: ireq_url = data.packages.joinpath("FSPkg").as_uri() ireq = get_processed_req_from_line(f"-e {ireq_url}#egg=FSPkg") + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + ireq, "cache_concrete_dist", partial(cache_concrete_dist, ireq) + ) reqset = resolver.resolve([ireq], True) assert len(reqset.all_requirements) == 1 req = reqset.all_requirements[0] @@ -915,7 +938,9 @@ def test_mismatched_versions(caplog: pytest.LogCaptureFixture) -> None: metadata = email.message.Message() metadata["name"] = "simplewheel" metadata["version"] = "1.0" - req._metadata = metadata + req._dist = get_metadata_distribution( + bytes(metadata), "simplewheel-1.0.whl", "simplewheel" + ) req.assert_source_matches_version() assert caplog.records[-1].message == ( diff --git a/tests/unit/test_wheel_builder.py b/tests/unit/test_wheel_builder.py index 0547ac818bc..b2ed3a89b28 100644 --- a/tests/unit/test_wheel_builder.py +++ b/tests/unit/test_wheel_builder.py @@ -8,7 +8,8 @@ import pytest -from pip._internal import wheel_builder +from pip._internal import cache, wheel_builder +from pip._internal.cache import _contains_egg_info from pip._internal.models.link import Link from pip._internal.operations.build.wheel_legacy import format_command_result from pip._internal.req.req_install import InstallRequirement @@ -31,7 +32,7 @@ ], ) def test_contains_egg_info(s: str, expected: bool) -> None: - result = wheel_builder._contains_egg_info(s) + result = _contains_egg_info(s) assert result == expected @@ -96,7 +97,7 @@ def test_should_build_for_install_command(req: ReqMock, expected: bool) -> None: ], ) def test_should_cache(req: ReqMock, expected: bool) -> None: - assert wheel_builder._should_cache(cast(InstallRequirement, req)) is expected + assert cache.should_cache(cast(InstallRequirement, req)) is expected def test_should_cache_git_sha(tmpdir: Path) -> None: @@ -106,12 +107,12 @@ def test_should_cache_git_sha(tmpdir: Path) -> None: # a link referencing a sha should be cached url = "git+https://g.c/o/r@" + commit + "#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert wheel_builder._should_cache(cast(InstallRequirement, req)) + assert cache.should_cache(cast(InstallRequirement, req)) # a link not referencing a sha should not be cached url = "git+https://g.c/o/r@master#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert not wheel_builder._should_cache(cast(InstallRequirement, req)) + assert not 
cache.should_cache(cast(InstallRequirement, req)) def test_format_command_result__INFO(caplog: pytest.LogCaptureFixture) -> None:
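For reference, the dist-caching lifecycle that src/pip/_internal/req/req_install.py now enforces can be sketched as follows (a minimal illustration; `virtual_dist` and `concrete_dist` are hypothetical stand-ins for dists produced by the preparer):

    # During the resolve, a metadata-only ("virtual") dist may be attached first.
    req.cache_virtual_metadata_only_dist(virtual_dist)  # asserts not dist.is_concrete
    assert not req.is_concrete  # nothing has been downloaded yet

    # finalize_linked_requirements() later hydrates the same requirement with a
    # downloaded dist; re-setting a dist is only allowed for this
    # virtual -> concrete transition.
    req.cache_concrete_dist(concrete_dist)  # asserts dist.is_concrete
    assert req.is_concrete
    assert req.get_dist() is concrete_dist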