From 12fa45ac51da3c78e8bdfa56cb6c0ff5c788ca59 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sat, 20 Jul 2024 14:13:50 -0400 Subject: [PATCH 1/5] cache "concrete" dists by Distribution instead of InstallRequirement --- news/12863.trivial.rst | 1 + src/pip/_internal/distributions/__init__.py | 5 ++ src/pip/_internal/distributions/base.py | 19 +++- src/pip/_internal/distributions/installed.py | 6 +- src/pip/_internal/distributions/sdist.py | 18 +++- src/pip/_internal/distributions/wheel.py | 4 +- src/pip/_internal/metadata/base.py | 21 +++++ .../_internal/metadata/importlib/_dists.py | 19 +++- src/pip/_internal/metadata/importlib/_envs.py | 4 +- src/pip/_internal/metadata/pkg_resources.py | 15 ++-- src/pip/_internal/operations/prepare.py | 47 ++++++---- src/pip/_internal/req/req_install.py | 87 +++++++++++++------ .../resolution/resolvelib/resolver.py | 2 +- .../metadata/test_metadata_pkg_resources.py | 1 + tests/unit/test_req.py | 31 ++++++- 15 files changed, 214 insertions(+), 66 deletions(-) create mode 100644 news/12863.trivial.rst diff --git a/news/12863.trivial.rst b/news/12863.trivial.rst new file mode 100644 index 00000000000..dc36a82a0df --- /dev/null +++ b/news/12863.trivial.rst @@ -0,0 +1 @@ +Cache "concrete" dists by ``Distribution`` instead of ``InstallRequirement``. diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index 9a89a838b9a..f6089daf40d 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -1,4 +1,5 @@ from pip._internal.distributions.base import AbstractDistribution +from pip._internal.distributions.installed import InstalledDistribution from pip._internal.distributions.sdist import SourceDistribution from pip._internal.distributions.wheel import WheelDistribution from pip._internal.req.req_install import InstallRequirement @@ -8,6 +9,10 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: """Returns a Distribution for the given InstallRequirement""" + # Only pre-installed requirements will have a .satisfied_by dist. + if install_req.satisfied_by: + return InstalledDistribution(install_req) + # Editable requirements will always be source distributions. They use the # legacy logic until we create a modern standard for them. if install_req.editable: diff --git a/src/pip/_internal/distributions/base.py b/src/pip/_internal/distributions/base.py index ea61f3501e7..3ccb5bd0288 100644 --- a/src/pip/_internal/distributions/base.py +++ b/src/pip/_internal/distributions/base.py @@ -39,11 +39,17 @@ def build_tracker_id(self) -> str | None: If None, then this dist has no work to do in the build tracker, and ``.prepare_distribution_metadata()`` will not be called.""" - raise NotImplementedError() + ... @abc.abstractmethod def get_metadata_distribution(self) -> BaseDistribution: - raise NotImplementedError() + """Generate a concrete ``BaseDistribution`` instance for this artifact. + + The implementation should also cache the result with + ``self.req.cache_concrete_dist()`` so the distribution is available to other + users of the ``InstallRequirement``. This method is not called within the build + tracker context, so it should not identify any new setup requirements.""" + ... 
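For orientation, the caller's side of this contract is ``_get_prepared_distribution()`` in ``operations/prepare.py``, which drives this hook together with ``.prepare_distribution_metadata()`` below. Roughly (a simplified sketch; arguments and error handling elided):

    # Sketch only: how the two abstract hooks are driven in sequence.
    abstract_dist = make_distribution_for_install_requirement(req)
    if (tracker_id := abstract_dist.build_tracker_id) is not None:
        with build_tracker.track(req, tracker_id):
            # Inside the tracker: may register new setup requirements.
            abstract_dist.prepare_distribution_metadata(
                build_env_installer, build_isolation, check_build_deps
            )
    # Outside the tracker: generates and caches the concrete dist via
    # req.cache_concrete_dist(), after which req.is_concrete holds.
    dist = abstract_dist.get_metadata_distribution()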
@abc.abstractmethod def prepare_distribution_metadata( @@ -52,4 +58,11 @@ def prepare_distribution_metadata( build_isolation: bool, check_build_deps: bool, ) -> None: - raise NotImplementedError() + """Generate the information necessary to extract metadata from the artifact. + + This method will be executed within the context of ``BuildTracker#track()``, so + it needs to fully identify any setup requirements so they can be added to the + same active set of tracked builds, while ``.get_metadata_distribution()`` takes + care of generating and caching the ``BaseDistribution`` to expose to the rest of + the resolve.""" + ... diff --git a/src/pip/_internal/distributions/installed.py b/src/pip/_internal/distributions/installed.py index b6a67df24f4..5e6ee336866 100644 --- a/src/pip/_internal/distributions/installed.py +++ b/src/pip/_internal/distributions/installed.py @@ -21,8 +21,10 @@ def build_tracker_id(self) -> str | None: return None def get_metadata_distribution(self) -> BaseDistribution: - assert self.req.satisfied_by is not None, "not actually installed" - return self.req.satisfied_by + dist = self.req.satisfied_by + assert dist is not None, "not actually installed" + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, diff --git a/src/pip/_internal/distributions/sdist.py b/src/pip/_internal/distributions/sdist.py index e2821f89e00..c3b91cef621 100644 --- a/src/pip/_internal/distributions/sdist.py +++ b/src/pip/_internal/distributions/sdist.py @@ -7,7 +7,7 @@ from pip._internal.build_env import BuildEnvironment from pip._internal.distributions.base import AbstractDistribution from pip._internal.exceptions import InstallationError -from pip._internal.metadata import BaseDistribution +from pip._internal.metadata import BaseDistribution, get_directory_distribution from pip._internal.utils.subprocess import runner_with_spinner_message if TYPE_CHECKING: @@ -24,13 +24,19 @@ class SourceDistribution(AbstractDistribution): """ @property - def build_tracker_id(self) -> str | None: + def build_tracker_id(self) -> str: """Identify this requirement uniquely by its link.""" assert self.req.link return self.req.link.url_without_fragment def get_metadata_distribution(self) -> BaseDistribution: - return self.req.get_dist() + assert ( + self.req.metadata_directory + ), "Set as part of .prepare_distribution_metadata()" + dist = get_directory_distribution(self.req.metadata_directory) + self.req.cache_concrete_dist(dist) + self.req.validate_sdist_metadata() + return dist def prepare_distribution_metadata( self, @@ -69,7 +75,11 @@ def prepare_distribution_metadata( self._raise_conflicts("the backend dependencies", conflicting) if missing: self._raise_missing_reqs(missing) - self.req.prepare_metadata() + + # NB: we must still call .cache_concrete_dist() and .validate_sdist_metadata() + # before the InstallRequirement itself has been updated with the metadata from + # this directory! 
+ self.req.prepare_metadata_directory() def _prepare_build_backend( self, build_env_installer: BuildEnvironmentInstaller diff --git a/src/pip/_internal/distributions/wheel.py b/src/pip/_internal/distributions/wheel.py index ee12bfadc2e..61898712d42 100644 --- a/src/pip/_internal/distributions/wheel.py +++ b/src/pip/_internal/distributions/wheel.py @@ -33,7 +33,9 @@ def get_metadata_distribution(self) -> BaseDistribution: assert self.req.local_file_path, "Set as part of preparation during download" assert self.req.name, "Wheels are never unnamed" wheel = FilesystemWheel(self.req.local_file_path) - return get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + dist = get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index 230e11473c6..5ed9c5d0fa6 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -92,6 +92,15 @@ class RequiresEntry(NamedTuple): class BaseDistribution(Protocol): + @property + def is_concrete(self) -> bool: + """Whether the distribution really exists somewhere on disk. + + If this is false, it has been synthesized from metadata, e.g. via + ``.from_metadata_file_contents()``, or ``.from_wheel()`` against + a ``MemoryWheel``.""" + raise NotImplementedError() + @classmethod def from_directory(cls, directory: str) -> BaseDistribution: """Load the distribution from a metadata directory. @@ -664,6 +673,10 @@ def iter_installed_distributions( class Wheel(Protocol): location: str + @property + def is_concrete(self) -> bool: + raise NotImplementedError() + def as_zipfile(self) -> zipfile.ZipFile: raise NotImplementedError() @@ -672,6 +685,10 @@ class FilesystemWheel(Wheel): def __init__(self, location: str) -> None: self.location = location + @property + def is_concrete(self) -> bool: + return True + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.location, allowZip64=True) @@ -681,5 +698,9 @@ def __init__(self, location: str, stream: IO[bytes]) -> None: self.location = location self.stream = stream + @property + def is_concrete(self) -> bool: + return False + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.stream, allowZip64=True) diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py index 97363af9a55..cb9ceb2e19e 100644 --- a/src/pip/_internal/metadata/importlib/_dists.py +++ b/src/pip/_internal/metadata/importlib/_dists.py @@ -104,16 +104,22 @@ def __init__( dist: importlib.metadata.Distribution, info_location: BasePath | None, installed_location: BasePath | None, + concrete: bool, ) -> None: self._dist = dist self._info_location = info_location self._installed_location = installed_location + self._concrete = concrete + + @property + def is_concrete(self) -> bool: + return self._concrete @classmethod def from_directory(cls, directory: str) -> BaseDistribution: info_location = pathlib.Path(directory) dist = importlib.metadata.Distribution.at(info_location) - return cls(dist, info_location, info_location.parent) + return cls(dist, info_location, info_location.parent, concrete=True) @classmethod def from_metadata_file_contents( @@ -130,7 +136,7 @@ def from_metadata_file_contents( metadata_path.write_bytes(metadata_contents) # Construct dist pointing to the newly created directory. 
dist = importlib.metadata.Distribution.at(metadata_path.parent) - return cls(dist, metadata_path.parent, None) + return cls(dist, metadata_path.parent, None, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -139,7 +145,14 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: dist = WheelDistribution.from_zipfile(zf, name, wheel.location) except zipfile.BadZipFile as e: raise InvalidWheel(wheel.location, name) from e - return cls(dist, dist.info_location, pathlib.PurePosixPath(wheel.location)) + except UnsupportedWheel as e: + raise UnsupportedWheel(f"{name} has an invalid wheel, {e}") + return cls( + dist, + dist.info_location, + pathlib.PurePosixPath(wheel.location), + concrete=wheel.is_concrete, + ) @property def location(self) -> str | None: diff --git a/src/pip/_internal/metadata/importlib/_envs.py b/src/pip/_internal/metadata/importlib/_envs.py index 71a73b7311f..0aeade7857d 100644 --- a/src/pip/_internal/metadata/importlib/_envs.py +++ b/src/pip/_internal/metadata/importlib/_envs.py @@ -86,7 +86,7 @@ def find(self, location: str) -> Iterator[BaseDistribution]: installed_location: BasePath | None = None else: installed_location = info_location.parent - yield Distribution(dist, info_location, installed_location) + yield Distribution(dist, info_location, installed_location, concrete=True) def find_legacy_editables(self, location: str) -> Iterator[BaseDistribution]: """Read location in egg-link files and return distributions in there. @@ -110,7 +110,7 @@ def find_legacy_editables(self, location: str) -> Iterator[BaseDistribution]: continue target_location = str(path.joinpath(target_rel)) for dist, info_location in self._find_impl(target_location): - yield Distribution(dist, info_location, path) + yield Distribution(dist, info_location, path, concrete=True) class Environment(BaseEnvironment): diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py index 89fce8b6e5d..1b9a59458f7 100644 --- a/src/pip/_internal/metadata/pkg_resources.py +++ b/src/pip/_internal/metadata/pkg_resources.py @@ -78,8 +78,9 @@ def run_script(self, script_name: str, namespace: str) -> None: class Distribution(BaseDistribution): - def __init__(self, dist: pkg_resources.Distribution) -> None: + def __init__(self, dist: pkg_resources.Distribution, concrete: bool) -> None: self._dist = dist + self._concrete = concrete # This is populated lazily, to avoid loading metadata for all possible # distributions eagerly. 
self.__extra_mapping: Mapping[NormalizedName, str] | None = None @@ -93,6 +94,10 @@ def _extra_mapping(self) -> Mapping[NormalizedName, str]: return self.__extra_mapping + @property + def is_concrete(self) -> bool: + return self._concrete + @classmethod def from_directory(cls, directory: str) -> BaseDistribution: dist_dir = directory.rstrip(os.sep) @@ -111,7 +116,7 @@ def from_directory(cls, directory: str) -> BaseDistribution: dist_name = os.path.splitext(dist_dir_name)[0].split("-")[0] dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata) - return cls(dist) + return cls(dist, concrete=True) @classmethod def from_metadata_file_contents( @@ -128,7 +133,7 @@ def from_metadata_file_contents( metadata=InMemoryMetadata(metadata_dict, filename), project_name=project_name, ) - return cls(dist) + return cls(dist, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -149,7 +154,7 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: metadata=InMemoryMetadata(metadata_dict, wheel.location), project_name=name, ) - return cls(dist) + return cls(dist, concrete=wheel.is_concrete) @property def location(self) -> str | None: @@ -261,7 +266,7 @@ def from_paths(cls, paths: list[str] | None) -> BaseEnvironment: def _iter_distributions(self) -> Iterator[BaseDistribution]: for dist in self._ws: - yield Distribution(dist) + yield Distribution(dist, concrete=True) def _search_distribution(self, name: str) -> BaseDistribution | None: """Find a distribution matching the ``name`` in the environment. diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 00b1a33a030..f2d72c0274a 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -16,7 +16,6 @@ from pip._internal.build_env import BuildEnvironmentInstaller from pip._internal.distributions import make_distribution_for_install_requirement -from pip._internal.distributions.installed import InstalledDistribution from pip._internal.exceptions import ( DirectoryUrlHashUnsupported, HashMismatch, @@ -200,6 +199,8 @@ def _check_download_dir( ) -> str | None: """Check download_dir for previously downloaded file with correct hash If a correct file is found return its path else None + + If a file is found at the given path, but with an invalid hash, the file is deleted. """ download_path = os.path.join(download_dir, link.filename) @@ -530,7 +531,9 @@ def prepare_linked_requirement( # The file is not available, attempt to fetch only metadata metadata_dist = self._fetch_metadata_only(req) if metadata_dist is not None: - req.needs_more_preparation = True + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) return metadata_dist # None of the optimizations worked, fully prepare the requirement @@ -540,27 +543,27 @@ def prepare_linked_requirements_more( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: """Prepare linked requirements more, if needed.""" - reqs = [req for req in reqs if req.needs_more_preparation] + partially_downloaded_reqs: list[InstallRequirement] = [] for req in reqs: + if req.is_concrete: + continue + # Determine if any of these requirements were already downloaded. 
if self.download_dir is not None and req.link.is_wheel: hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir(req.link, self.download_dir, hashes) + # If the file is there, but doesn't match the hash, delete it and print + # a warning. We will be downloading it again via + # partially_downloaded_reqs. + file_path = _check_download_dir( + req.link, self.download_dir, hashes, warn_on_hash_mismatch=True + ) if file_path is not None: + # If the hash does match, then we still need to generate a concrete + # dist, but we don't have to download the wheel again. self._downloaded[req.link.url] = file_path - req.needs_more_preparation = False - # Prepare requirements we found were already downloaded for some - # reason. The other downloads will be completed separately. - partially_downloaded_reqs: list[InstallRequirement] = [] - for req in reqs: - if req.needs_more_preparation: - partially_downloaded_reqs.append(req) - else: - self._prepare_linked_requirement(req, parallel_builds) + partially_downloaded_reqs.append(req) - # TODO: separate this part out from RequirementPreparer when the v1 - # resolver can be removed! self._complete_partial_requirements( partially_downloaded_reqs, parallel_builds=parallel_builds, @@ -661,6 +664,7 @@ def _prepare_linked_requirement( def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None + assert req.is_concrete link = req.link if link.is_vcs or (link.is_existing_dir() and req.editable): # Make a .zip of the source_dir we already created. @@ -715,6 +719,8 @@ def prepare_editable_requirement( req.check_if_exists(self.use_user_site) + # This should already have been populated by the preparation of the source dist. + assert req.is_concrete return dist def prepare_installed_requirement( @@ -739,4 +745,13 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." 
) - return InstalledDistribution(req).get_metadata_distribution() + dist = _get_prepared_distribution( + req, + self.build_tracker, + self.build_env_installer, + self.build_isolation, + self.check_build_deps, + ) + + assert req.is_concrete + return dist diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index c9f6bff17e8..9e64c26d384 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -10,7 +10,7 @@ from collections.abc import Collection, Iterable, Sequence from optparse import Values from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING from pip._vendor.packaging.markers import Marker from pip._vendor.packaging.requirements import Requirement @@ -26,10 +26,7 @@ from pip._internal.metadata import ( BaseDistribution, get_default_environment, - get_directory_distribution, - get_wheel_distribution, ) -from pip._internal.metadata.base import FilesystemWheel from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.operations.build.metadata import generate_metadata @@ -62,6 +59,9 @@ from pip._internal.utils.virtualenv import running_under_virtualenv from pip._internal.vcs import vcs +if TYPE_CHECKING: + import email.message + logger = logging.getLogger(__name__) @@ -153,6 +153,7 @@ def __init__( self.hash_options = hash_options if hash_options else {} self.config_settings = config_settings # Set to True after successful preparation of this requirement + # TODO: this is only used in the legacy resolver: remove this! self.prepared = False # User supplied requirement are explicitly requested for installation # by the user via CLI arguments or requirements files, as opposed to, @@ -194,8 +195,11 @@ def __init__( ) self.use_pep517 = True - # This requirement needs more preparation before it can be built - self.needs_more_preparation = False + # When a dist is computed for this requirement, cache it here so it's visible + # everywhere within pip and isn't computed more than once. This may be + # a "virtual" dist without a physical location on the filesystem, or + # a "concrete" dist which has been fully downloaded. + self._dist: BaseDistribution | None = None # This requirement needs to be unpacked before it can be installed. self._archive_source: Path | None = None @@ -553,11 +557,11 @@ def isolated_editable_sanity_check(self) -> None: f"Consider using a build backend that supports PEP 660." ) - def prepare_metadata(self) -> None: + def prepare_metadata_directory(self) -> None: """Ensure that project metadata is available. - Under PEP 517 and PEP 660, call the backend hook to prepare the metadata. - Under legacy processing, call setup.py egg-info. + Under PEP 517 and PEP 660, call the backend hook to prepare the metadata + directory. Under legacy processing, call setup.py egg-info. """ assert self.source_dir, f"No source dir for {self}" details = self.name or f"from {self.link}" @@ -589,6 +593,8 @@ def prepare_metadata(self) -> None: details=details, ) + def validate_sdist_metadata(self) -> None: + """Ensure that we have a dist, and ensure it corresponds to expectations.""" # Act on the newly generated metadata, based on the name and version. 
if not self.name: self._set_requirement() @@ -598,25 +604,54 @@ def prepare_metadata(self) -> None: self.assert_source_matches_version() @property - def metadata(self) -> Any: - if not hasattr(self, "_metadata"): - self._metadata = self.get_dist().metadata - - return self._metadata + def metadata(self) -> email.message.Message: + return self.get_dist().metadata def get_dist(self) -> BaseDistribution: - if self.metadata_directory: - return get_directory_distribution(self.metadata_directory) - elif self.local_file_path and self.is_wheel: - assert self.req is not None - return get_wheel_distribution( - FilesystemWheel(self.local_file_path), - canonicalize_name(self.req.name), - ) - raise AssertionError( - f"InstallRequirement {self} has no metadata directory and no wheel: " - f"can't make a distribution." - ) + """Retrieve the dist resolved from this requirement. + + :raises AssertionError: if the resolver has not yet been executed. + """ + if self._dist is None: + raise AssertionError(f"{self!r} has no dist associated.") + return self._dist + + def cache_virtual_metadata_only_dist(self, dist: BaseDistribution) -> None: + """Associate a "virtual" metadata-only dist to this requirement. + + This dist cannot be installed, but it can be used to complete the resolve + process. + + :raises AssertionError: if a dist has already been associated. + :raises AssertionError: if the provided dist is "concrete", i.e. exists + somewhere on the filesystem. + """ + assert self._dist is None, self + assert not dist.is_concrete, dist + self._dist = dist + + def cache_concrete_dist(self, dist: BaseDistribution) -> None: + """Associate a "concrete" dist to this requirement. + + A concrete dist exists somewhere on the filesystem and can be installed. + + :raises AssertionError: if a concrete dist has already been associated. + :raises AssertionError: if the provided dist is not concrete. + """ + if self._dist is not None: + # If we set a dist twice for the same requirement, we must be hydrating + # a concrete dist for what was previously virtual. This will occur in the + # case of `install --dry-run` when PEP 658 metadata is available. + + # TODO(#12186): avoid setting dist twice! 
+ # assert not self._dist.is_concrete + pass + assert dist.is_concrete + self._dist = dist + + @property + def is_concrete(self) -> bool: + return self._dist is not None and self._dist.is_concrete def assert_source_matches_version(self) -> None: assert self.source_dir, f"No source dir for {self}" diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py index 1ba70c2b39e..17086ce417d 100644 --- a/src/pip/_internal/resolution/resolvelib/resolver.py +++ b/src/pip/_internal/resolution/resolvelib/resolver.py @@ -184,7 +184,7 @@ def resolve( self.factory.preparer.prepare_linked_requirements_more(reqs) for req in reqs: req.prepared = True - req.needs_more_preparation = False + assert req.is_concrete return req_set def get_installation_order( diff --git a/tests/unit/metadata/test_metadata_pkg_resources.py b/tests/unit/metadata/test_metadata_pkg_resources.py index ccb0b7dcf0f..64433803893 100644 --- a/tests/unit/metadata/test_metadata_pkg_resources.py +++ b/tests/unit/metadata/test_metadata_pkg_resources.py @@ -105,6 +105,7 @@ def test_wheel_metadata_works() -> None: metadata=InMemoryMetadata({"METADATA": metadata.as_bytes()}, ""), project_name=name, ), + concrete=False, ) assert name == dist.canonical_name == dist.raw_name diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py index a2c4cf243ca..f3a79351221 100644 --- a/tests/unit/test_req.py +++ b/tests/unit/test_req.py @@ -28,6 +28,7 @@ PreviousBuildDirError, ) from pip._internal.index.package_finder import PackageFinder +from pip._internal.metadata import get_metadata_distribution from pip._internal.models.direct_url import ArchiveInfo, DirectUrl, DirInfo, VcsInfo from pip._internal.models.link import Link from pip._internal.network.session import PipSession @@ -154,7 +155,11 @@ def test_no_reuse_existing_build_dir(self, data: TestData) -> None: ): resolver.resolve(reqset.all_requirements, True) - def test_environment_marker_extras(self, data: TestData) -> None: + def test_environment_marker_extras( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """ Test that the environment marker extras are used with non-wheel installs. 
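The tests below monkeypatch ``cache_concrete_dist()``; for reference, the intended lifecycle of the ``_dist`` slot introduced above is, schematically (``metadata_dist`` and ``wheel_dist`` are hypothetical stand-ins for dists produced during the resolve and after download, respectively):

    req.cache_virtual_metadata_only_dist(metadata_dist)  # e.g. from a PEP 658 .metadata file
    assert not req.is_concrete           # resolvable, but nothing on disk to install
    req.cache_concrete_dist(wheel_dist)  # once the actual artifact has been fetched
    assert req.is_concrete
    assert req.get_dist() is wheel_dist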
@@ -164,6 +169,13 @@ def test_environment_marker_extras(self, data: TestData) -> None: os.fspath(data.packages.joinpath("LocalEnvironMarker")), ) req.user_supplied = True + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + req, "cache_concrete_dist", partial(cache_concrete_dist, req) + ) reqset.add_unnamed_requirement(req) finder = make_test_finder(find_links=[data.find_links]) with self._basic_resolver(finder) as resolver: @@ -505,12 +517,23 @@ def test_download_info_local_dir(self, data: TestData) -> None: assert req.download_info.url.startswith("file://") assert isinstance(req.download_info.info, DirInfo) - def test_download_info_local_editable_dir(self, data: TestData) -> None: + def test_download_info_local_editable_dir( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """Test that download_info is set for requirements from a local editable dir.""" finder = make_test_finder() with self._basic_resolver(finder) as resolver: ireq_url = data.packages.joinpath("FSPkg").as_uri() ireq = get_processed_req_from_line(f"-e {ireq_url}#egg=FSPkg") + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + ireq, "cache_concrete_dist", partial(cache_concrete_dist, ireq) + ) reqset = resolver.resolve([ireq], True) assert len(reqset.all_requirements) == 1 req = reqset.all_requirements[0] @@ -915,7 +938,9 @@ def test_mismatched_versions(caplog: pytest.LogCaptureFixture) -> None: metadata = email.message.Message() metadata["name"] = "simplewheel" metadata["version"] = "1.0" - req._metadata = metadata + req._dist = get_metadata_distribution( + bytes(metadata), "simplewheel-1.0.whl", "simplewheel" + ) req.assert_source_matches_version() assert caplog.records[-1].message == ( From a0e34bb45d24a787f21dc765ea2e8f7a5ab2f154 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:09:04 -0400 Subject: [PATCH 2/5] refactor much of RequirementPreparer and add lengthy docs --- news/12871.trivial.rst | 1 + src/pip/_internal/distributions/__init__.py | 9 +- src/pip/_internal/operations/prepare.py | 323 +++++++++++--------- 3 files changed, 180 insertions(+), 153 deletions(-) create mode 100644 news/12871.trivial.rst diff --git a/news/12871.trivial.rst b/news/12871.trivial.rst new file mode 100644 index 00000000000..186e2bcb3c4 --- /dev/null +++ b/news/12871.trivial.rst @@ -0,0 +1 @@ +Refactor much of ``RequirementPreparer`` to avoid duplicated code paths for metadata-only requirements. diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index f6089daf40d..a870b227d1e 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -8,7 +8,14 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: - """Returns a Distribution for the given InstallRequirement""" + """Returns an AbstractDistribution for the given InstallRequirement. + + As AbstractDistribution only covers installable artifacts, this method may only be + invoked at the conclusion of a resolve, when the RequirementPreparer has downloaded + the file corresponding to the resolved dist. Commands which intend to consume + metadata-only resolves without downloading should not call this method or + consume AbstractDistribution objects. 
+ """ # Only pre-installed requirements will have a .satisfied_by dist. if install_req.satisfied_by: return InstalledDistribution(install_req) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index f2d72c0274a..fd600d16272 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -68,7 +68,17 @@ def _get_prepared_distribution( build_isolation: bool, check_build_deps: bool, ) -> BaseDistribution: - """Prepare a distribution for installation.""" + """Prepare a distribution for installation. + + This method will only be called by the preparer at the end of the resolve, and only + for commands which need installable artifacts (not just resolved metadata). If the + dist was previously metadata-only, the preparer must have downloaded the file + corresponding to the dist and set ``req.local_file_path``. + + This method will execute ``req.cache_concrete_dist()``, so that after invocation, + ``req.is_concrete`` will be True, because ``req.get_dist()`` will return a concrete + ``Distribution``. + """ abstract_dist = make_distribution_for_install_requirement(req) tracker_id = abstract_dist.build_tracker_id if tracker_id is not None: @@ -318,11 +328,7 @@ def _ensure_link_req_src_dir( self, req: InstallRequirement, parallel_builds: bool ) -> None: """Ensure source_dir of a linked InstallRequirement.""" - # Since source_dir is only set for editable requirements. - if req.link.is_wheel: - # We don't need to unpack wheels, so no need for a source - # directory. - return + assert not req.link.is_wheel assert req.source_dir is None if req.link.is_existing_dir(): # build local directories in-tree @@ -369,6 +375,47 @@ def _get_linked_req_hashes(self, req: InstallRequirement) -> Hashes: # showing the user what the hash should be. return req.hashes(trust_internet=False) or MissingHashes() + def _rewrite_link_and_hashes_for_cached_wheel( + self, req: InstallRequirement, hashes: Hashes + ) -> Hashes | None: + """Check the hash of the requirement's source eagerly and rewrite its link. + + ``req.link`` is unconditionally rewritten to the cached wheel source so that the + requirement corresponds to where it was actually downloaded from instead of the + local cache entry. + + :returns: None if the source hash validated successfully. + """ + assert hashes + assert req.is_wheel_from_cache + assert req.download_info is not None + assert req.link.is_wheel + assert req.link.is_file + + # TODO: is it possible to avoid providing a "wrong" req.link in the first place + # in the resolver, instead of having to patch it up afterwards? + req.link = req.cached_wheel_source_link + + # We need to verify hashes, and we have found the requirement in the cache + # of locally built wheels. + if ( + isinstance(req.download_info.info, ArchiveInfo) + and req.download_info.info.hashes + and hashes.has_one_of(req.download_info.info.hashes) + ): + # At this point we know the requirement was built from a hashable source + # artifact, and we verified that the cache entry's hash of the original + # artifact matches one of the hashes we expect. We don't verify hashes + # against the cached wheel, because the wheel is not the original. + return None + + logger.warning( + "The hashes of the source archive found in cache entry " + "don't match, ignoring cached built wheel " + "and re-downloading source." 
+ )
+ return hashes

 def _fetch_metadata_only(
 self,
 req: InstallRequirement,
@@ -460,7 +507,7 @@ def _fetch_metadata_using_lazy_wheel
 def _complete_partial_requirements(
 self,
- partially_downloaded_reqs: Iterable[InstallRequirement],
+ metadata_only_reqs: Iterable[InstallRequirement],
 parallel_builds: bool = False,
 ) -> None:
 """Download any requirements which were only fetched by metadata."""
@@ -471,10 +518,9 @@
 # Map each link to the requirement that owns it. This allows us to set
 # `req.local_file_path` on the appropriate requirement after passing
 # all the links at once into BatchDownloader.
- links_to_fully_download: dict[Link, InstallRequirement] = {}
- for req in partially_downloaded_reqs:
- assert req.link
- links_to_fully_download[req.link] = req
+ links_to_fully_download: dict[Link, InstallRequirement] = {
+ req.link: req for req in metadata_only_reqs
+ }

 batch_download = self._download.batch(links_to_fully_download.keys(), temp_dir)
 for link, (filepath, _) in batch_download:
@@ -496,9 +542,33 @@
 # This step is necessary to ensure all lazy wheels are processed
 # successfully by the 'download', 'wheel', and 'install' commands.
- for req in partially_downloaded_reqs:
+ for req in metadata_only_reqs:
 self._prepare_linked_requirement(req, parallel_builds)

+ def _check_download_path(self, req: InstallRequirement) -> None:
+ """Check if the relevant file is already available in the download directory.
+
+ If so, check its hash, and delete the file if the hash doesn't match."""
+ if self.download_dir is None:
+ return
+ if not req.link.is_wheel:
+ return
+
+ hashes = self._get_linked_req_hashes(req)
+ if file_path := _check_download_dir(
+ req.link,
+ self.download_dir,
+ hashes,
+ # When a locally built wheel has been found in cache, we don't warn
+ # about re-downloading when the already downloaded wheel hash does
+ # not match. This is because the hash must be checked against the
+ # original link, not the cached link. In that case the already
+ # downloaded file will be removed and re-fetched from cache (which
+ # implies a hash check against the cache entry's origin.json).
+ warn_on_hash_mismatch=not req.is_wheel_from_cache,
+ ):
+ self._downloaded[req.link.url] = file_path

 def prepare_linked_requirement(
 self, req: InstallRequirement, parallel_builds: bool = False
 ) -> BaseDistribution:
@@ -506,35 +576,15 @@
 assert req.link
 self._log_preparing_link(req)
 with indent_log():
- # Check if the relevant file is already available
- # in the download directory
- file_path = None
- if self.download_dir is not None and req.link.is_wheel:
- hashes = self._get_linked_req_hashes(req)
- file_path = _check_download_dir(
- req.link,
- self.download_dir,
- hashes,
- # When a locally built wheel has been found in cache, we don't warn
- # about re-downloading when the already downloaded wheel hash does
- # not match. This is because the hash must be checked against the
- # original link, not the cached link. It that case the already
- # downloaded file will be removed and re-fetched from cache (which
- # implies a hash check against the cache entry's origin.json).
- warn_on_hash_mismatch=not req.is_wheel_from_cache,
- )
+ # See if the file is already downloaded, and check its hash if so.
+ self._check_download_path(req) - if file_path is not None: - # The file is already available, so mark it as downloaded - self._downloaded[req.link.url] = file_path - else: - # The file is not available, attempt to fetch only metadata - metadata_dist = self._fetch_metadata_only(req) - if metadata_dist is not None: - # These reqs now have the dependency information from the downloaded - # metadata, without having downloaded the actual dist at all. - req.cache_virtual_metadata_only_dist(metadata_dist) - return metadata_dist + # First try to fetch only metadata. + if metadata_dist := self._fetch_metadata_only(req): + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) + return metadata_dist # None of the optimizations worked, fully prepare the requirement return self._prepare_linked_requirement(req, parallel_builds) @@ -543,73 +593,37 @@ def prepare_linked_requirements_more( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: """Prepare linked requirements more, if needed.""" - partially_downloaded_reqs: list[InstallRequirement] = [] - for req in reqs: - if req.is_concrete: - continue - - # Determine if any of these requirements were already downloaded. - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - # If the file is there, but doesn't match the hash, delete it and print - # a warning. We will be downloading it again via - # partially_downloaded_reqs. - file_path = _check_download_dir( - req.link, self.download_dir, hashes, warn_on_hash_mismatch=True - ) - if file_path is not None: - # If the hash does match, then we still need to generate a concrete - # dist, but we don't have to download the wheel again. - self._downloaded[req.link.url] = file_path - - partially_downloaded_reqs.append(req) - + metadata_only_reqs = [req for req in reqs if not req.is_concrete] self._complete_partial_requirements( - partially_downloaded_reqs, + metadata_only_reqs, parallel_builds=parallel_builds, ) - def _prepare_linked_requirement( - self, req: InstallRequirement, parallel_builds: bool - ) -> BaseDistribution: - assert req.link - link = req.link - - hashes = self._get_linked_req_hashes(req) - - if hashes and req.is_wheel_from_cache: - assert req.download_info is not None - assert link.is_wheel - assert link.is_file - # We need to verify hashes, and we have found the requirement in the cache - # of locally built wheels. - if ( - isinstance(req.download_info.info, ArchiveInfo) - and req.download_info.info.hashes - and hashes.has_one_of(req.download_info.info.hashes) - ): - # At this point we know the requirement was built from a hashable source - # artifact, and we verified that the cache entry's hash of the original - # artifact matches one of the hashes we expect. We don't verify hashes - # against the cached wheel, because the wheel is not the original. - hashes = None - else: - logger.warning( - "The hashes of the source archive found in cache entry " - "don't match, ignoring cached built wheel " - "and re-downloading source." 
- ) - req.link = req.cached_wheel_source_link - link = req.link + def _ensure_local_file_path( + self, req: InstallRequirement, hashes: Hashes | None + ) -> None: + """Ensure that ``req.link`` is downloaded locally, matches the expected hash, + and that ``req.local_file_path`` is set to the download location.""" + if req.link.is_existing_dir(): + return - self._ensure_link_req_src_dir(req, parallel_builds) + # NB: req.local_file_path may be set already, if it was: + # (1) sourced from a local file (such as a local .tar.gz path), + # (2) also in the wheel cache (e.g. built from an sdist). + # We will overwrite it if so, since the local file path will still point to the + # .tar.gz source instead of the wheel cache entry. - if link.is_existing_dir(): - local_file = None - elif link.url not in self._downloaded: + local_file: File | None = None + # The file may have already been downloaded in batch if it was + # a metadata-only requirement, or if it was already in the download directory. + if file_path := self._downloaded.get(req.link.url, None): + if hashes: + hashes.check_against_path(file_path) + local_file = File(file_path, content_type=None) + else: try: local_file = unpack_url( - link, + req.link, req.source_dir, self._download, self.verbosity, @@ -619,39 +633,13 @@ def _prepare_linked_requirement( except NetworkConnectionError as exc: raise InstallationError( f"Could not install requirement {req} because of HTTP " - f"error {exc} for URL {link}" - ) - else: - file_path = self._downloaded[link.url] - if hashes: - hashes.check_against_path(file_path) - local_file = File(file_path, content_type=None) + f"error {exc} for URL {req.link}" + ) from exc - # If download_info is set, we got it from the wheel cache. - if req.download_info is None: - # Editables don't go through this function (see - # prepare_editable_requirement). - assert not req.editable - req.download_info = direct_url_from_link(link, req.source_dir) - # Make sure we have a hash in download_info. If we got it as part of the - # URL, it will have been verified and we can rely on it. Otherwise we - # compute it from the downloaded file. - # FIXME: https://github.com/pypa/pip/issues/11943 - if ( - isinstance(req.download_info.info, ArchiveInfo) - and not req.download_info.info.hashes - and local_file - ): - hash = hash_file(local_file.path)[0].hexdigest() - # We populate info.hash for backward compatibility. - # This will automatically populate info.hashes. - req.download_info.info.hash = f"sha256={hash}" - - # For use in later processing, - # preserve the file path on the requirement. - if local_file: + if local_file is not None: req.local_file_path = local_file.path + def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistribution: dist = _get_prepared_distribution( req, self.build_tracker, @@ -659,8 +647,60 @@ def _prepare_linked_requirement( self.build_isolation, self.check_build_deps, ) + assert dist.is_concrete + assert req.is_concrete + assert req.get_dist() is dist return dist + def _prepare_linked_requirement( + self, req: InstallRequirement, parallel_builds: bool + ) -> BaseDistribution: + """Ensure the dist pointing to the fully-resolved requirement is downloaded and + installable.""" + assert req.link, "this requirement must have a download link to fully prepare" + + hashes: Hashes | None = self._get_linked_req_hashes(req) + + if hashes and req.is_wheel_from_cache: + hashes = self._rewrite_link_and_hashes_for_cached_wheel(req, hashes) + + # req.source_dir is only set for editable requirements. 
We don't need to unpack + # wheels, so no need for a source directory. + if not req.link.is_wheel: + self._ensure_link_req_src_dir(req, parallel_builds) + + # Ensure the dist is downloaded, check its hash, and unpack it into the source + # directory (if applicable). + self._ensure_local_file_path(req, hashes) + + # Set req.download_info for --report output. + if req.download_info is None: + # If download_info is set, we got it from the wheel cache. + self._ensure_download_info(req) + + # Build (if necessary) and prepare the distribution for installation. + return self._prepare_and_finalize_dist(req) + + def _ensure_download_info(self, req: InstallRequirement) -> None: + """Ensure that ``req.download_info`` is set, for uses such as --report.""" + assert req.download_info is None + # Editables don't go through this function (see prepare_editable_requirement). + assert not req.editable + req.download_info = direct_url_from_link(req.link, req.source_dir) + # Make sure we have a hash in download_info. If we got it as part of the + # URL, it will have been verified and we can rely on it. Otherwise we + # compute it from the downloaded file. + # FIXME: https://github.com/pypa/pip/issues/11943 + if ( + isinstance(req.download_info.info, ArchiveInfo) + and not req.download_info.info.hashes + and req.local_file_path + ): + hash = hash_file(req.local_file_path)[0].hexdigest() + # We populate info.hash for backward compatibility. + # This will automatically populate info.hashes. + req.download_info.info.hash = f"sha256={hash}" + def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None @@ -708,20 +748,8 @@ def prepare_editable_requirement( req.update_editable() assert req.source_dir req.download_info = direct_url_for_editable(req.unpacked_source_directory) - - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.build_env_installer, - self.build_isolation, - self.check_build_deps, - ) - req.check_if_exists(self.use_user_site) - - # This should already have been populated by the preparation of the source dist. - assert req.is_concrete - return dist + return self._prepare_and_finalize_dist(req) def prepare_installed_requirement( self, @@ -745,13 +773,4 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." ) - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.build_env_installer, - self.build_isolation, - self.check_build_deps, - ) - - assert req.is_concrete - return dist + return self._prepare_and_finalize_dist(req) From 93830caea724129256a721161b648242e123657b Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:40:08 -0400 Subject: [PATCH 3/5] use .metadata distribution info when possible When performing `install --dry-run` and PEP 658 .metadata files are available to guide the resolve, do not download the associated wheels. Rather use the distribution information directly from the .metadata files when reporting the results on the CLI and in the --report file. 
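Concretely, against the test index used in this series (where ``simple2==1.0`` depends on ``simple==1.0``), the new behavior looks like:

    $ pip install --dry-run --ignore-installed --report report.json simple2==1.0
    # With PEP 658 .metadata files published alongside the dists, nothing is
    # downloaded beyond the .metadata files themselves; report.json still
    # records the resolved version and download_info of every dist.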
- describe the new --dry-run behavior - finalize linked requirements immediately after resolve - introduce is_concrete - funnel InstalledDistribution through _get_prepared_distribution() too - add test for new install --dry-run functionality (no downloading) --- news/12186.bugfix.rst | 1 + src/pip/_internal/commands/download.py | 5 +- src/pip/_internal/commands/install.py | 7 +- src/pip/_internal/commands/wheel.py | 5 +- src/pip/_internal/operations/check.py | 5 +- src/pip/_internal/operations/prepare.py | 59 ++++- src/pip/_internal/req/req_install.py | 5 +- .../resolution/resolvelib/resolver.py | 5 - tests/conftest.py | 37 ++- tests/functional/test_download.py | 3 +- tests/functional/test_install_metadata.py | 239 ++++++++++++++++++ 11 files changed, 345 insertions(+), 26 deletions(-) create mode 100644 news/12186.bugfix.rst create mode 100644 tests/functional/test_install_metadata.py diff --git a/news/12186.bugfix.rst b/news/12186.bugfix.rst new file mode 100644 index 00000000000..b51d84a0cb2 --- /dev/null +++ b/news/12186.bugfix.rst @@ -0,0 +1 @@ +Avoid downloading any dists in ``install --dry-run`` if PEP 658 ``.metadata`` files or lazy wheels are available. diff --git a/src/pip/_internal/commands/download.py b/src/pip/_internal/commands/download.py index 900fb403d6f..a79fdec3050 100644 --- a/src/pip/_internal/commands/download.py +++ b/src/pip/_internal/commands/download.py @@ -129,6 +129,9 @@ def run(self, options: Values, args: list[str]) -> int: self.trace_basic_info(finder) requirement_set = resolver.resolve(reqs, check_supported_wheels=True) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), require_dist_files=True + ) downloaded: list[str] = [] for req in requirement_set.requirements.values(): @@ -137,8 +140,6 @@ def run(self, options: Values, args: list[str]) -> int: preparer.save_linked_requirement(req) downloaded.append(req.name) - preparer.prepare_linked_requirements_more(requirement_set.requirements.values()) - if downloaded: write_output("Successfully downloaded %s", " ".join(downloaded)) diff --git a/src/pip/_internal/commands/install.py b/src/pip/_internal/commands/install.py index 1ef7a0f4410..3f5d3e6c256 100644 --- a/src/pip/_internal/commands/install.py +++ b/src/pip/_internal/commands/install.py @@ -99,7 +99,8 @@ def add_options(self) -> None: help=( "Don't actually install anything, just print what would be. " "Can be used in combination with --ignore-installed " - "to 'resolve' the requirements." + "to 'resolve' the requirements. If package metadata is available " + "or cached, --dry-run also avoids downloading the dependency at all." 
 ),
 )
 self.cmd_opts.add_option(
@@ -393,6 +394,10 @@ def run(self, options: Values, args: list[str]) -> int:
 requirement_set = resolver.resolve(
 reqs, check_supported_wheels=not options.target_dir
 )
+ preparer.finalize_linked_requirements(
+ requirement_set.requirements.values(),
+ require_dist_files=not options.dry_run,
+ )

 if options.json_report_file:
 report = InstallationReport(requirement_set.requirements_to_install)
diff --git a/src/pip/_internal/commands/wheel.py b/src/pip/_internal/commands/wheel.py
index 61be254912f..9519a940fc9 100644
--- a/src/pip/_internal/commands/wheel.py
+++ b/src/pip/_internal/commands/wheel.py
@@ -144,6 +144,9 @@ def run(self, options: Values, args: list[str]) -> int:
 self.trace_basic_info(finder)

 requirement_set = resolver.resolve(reqs, check_supported_wheels=True)
+ preparer.finalize_linked_requirements(
+ requirement_set.requirements.values(), require_dist_files=True
+ )

 reqs_to_build: list[InstallRequirement] = []
 for req in requirement_set.requirements.values():
@@ -152,8 +155,6 @@
 else:
 reqs_to_build.append(req)

- preparer.prepare_linked_requirements_more(requirement_set.requirements.values())
-
 # build wheels
 build_successes, build_failures = build(
 reqs_to_build,
diff --git a/src/pip/_internal/operations/check.py b/src/pip/_internal/operations/check.py
index 2d71fa5fff5..1ceca5c4f57 100644
--- a/src/pip/_internal/operations/check.py
+++ b/src/pip/_internal/operations/check.py
@@ -17,7 +17,6 @@
 from pip._vendor.packaging.utils import NormalizedName, canonicalize_name
 from pip._vendor.packaging.version import Version

-from pip._internal.distributions import make_distribution_for_install_requirement
 from pip._internal.metadata import get_default_environment
 from pip._internal.metadata.base import BaseDistribution
 from pip._internal.req.req_install import InstallRequirement
@@ -148,8 +147,8 @@ def _simulate_installation_of(
 # Modify it as installing requirement_set would (assuming no errors)
 for inst_req in to_install:
- abstract_dist = make_distribution_for_install_requirement(inst_req)
- dist = abstract_dist.get_metadata_distribution()
+ assert inst_req.is_concrete
+ dist = inst_req.get_dist()
 name = dist.canonical_name
 package_set[name] = PackageDetails(dist.version, list(dist.iter_dependencies()))
diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py
index fd600d16272..9e920e9218b 100644
--- a/src/pip/_internal/operations/prepare.py
+++ b/src/pip/_internal/operations/prepare.py
@@ -589,16 +589,71 @@ def prepare_linked_requirement(
 # None of the optimizations worked, fully prepare the requirement
 return self._prepare_linked_requirement(req, parallel_builds)

- def prepare_linked_requirements_more(
+ def _extract_download_info(self, reqs: Iterable[InstallRequirement]) -> None:
+ """
+ `pip install --report` extracts the download info from each requirement for its
+ JSON output, so we need to make sure every requirement has this before finishing
+ the resolve. But by the point this method is called, .download_info will only
+ have been populated for requirements already found in the wheel cache, so we
+ need to synthesize it for uncached results. Luckily, a DirectUrl can be parsed
+ directly from a url without any other context. However, this also means the
+ download info will only contain a hash if the link itself declares the hash.
+ """ + for req in reqs: + if req.download_info is None: + self._ensure_download_info(req) + + def _force_fully_prepared( + self, reqs: Iterable[InstallRequirement], assert_has_dist_files: bool + ) -> None: + """ + The legacy resolver seems to prepare requirements differently that can leave + them half-done in certain code paths. I'm not quite sure how it's doing things, + but at least we can do this to make sure they do things right. + """ + for req in reqs: + req.prepared = True + if assert_has_dist_files: + assert req.is_concrete + + def _ensure_dist_files( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: - """Prepare linked requirements more, if needed.""" + """Download any metadata-only linked requirements.""" metadata_only_reqs = [req for req in reqs if not req.is_concrete] self._complete_partial_requirements( metadata_only_reqs, parallel_builds=parallel_builds, ) + def finalize_linked_requirements( + self, + reqs: Iterable[InstallRequirement], + require_dist_files: bool, + parallel_builds: bool = False, + ) -> None: + """Prepare linked requirements more, if needed. + + Neighboring .metadata files as per PEP 658 or lazy wheels via fast-deps will be + preferred to extract metadata from any concrete requirement (one that has been + mapped to a Link) without downloading the underlying wheel or sdist. When ``pip + install --dry-run`` is called, we want to avoid ever downloading the underlying + dist, but we still need to provide all of the results that pip commands expect + from the typical resolve process. + + Those expectations vary, but one distinction lies in whether the command needs + an actual physical dist somewhere on the filesystem, or just the metadata about + it from the resolver (as in ``pip install --report``). If the command requires + actual physical filesystem locations for the resolved dists, it must call this + method with ``require_dist_files=True`` to fully download anything + that remains. + """ + if require_dist_files: + self._ensure_dist_files(reqs, parallel_builds=parallel_builds) + else: + self._extract_download_info(reqs) + self._force_fully_prepared(reqs, assert_has_dist_files=require_dist_files) + def _ensure_local_file_path( self, req: InstallRequirement, hashes: Hashes | None ) -> None: diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index 9e64c26d384..c284248e383 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -642,10 +642,7 @@ def cache_concrete_dist(self, dist: BaseDistribution) -> None: # If we set a dist twice for the same requirement, we must be hydrating # a concrete dist for what was previously virtual. This will occur in the # case of `install --dry-run` when PEP 658 metadata is available. - - # TODO(#12186): avoid setting dist twice! 
- # assert not self._dist.is_concrete
- pass
+ assert not self._dist.is_concrete
 assert dist.is_concrete
 self._dist = dist
diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py
index 17086ce417d..c4fd4e28f93 100644
--- a/src/pip/_internal/resolution/resolvelib/resolver.py
+++ b/src/pip/_internal/resolution/resolvelib/resolver.py
@@ -180,11 +180,6 @@ def resolve(
 req_set.add_named_requirement(ireq)

- reqs = req_set.all_requirements
- self.factory.preparer.prepare_linked_requirements_more(reqs)
- for req in reqs:
- req.prepared = True
- assert req.is_concrete
 return req_set

 def get_installation_order(
diff --git a/tests/conftest.py b/tests/conftest.py
index c98b871170f..80d5ba8bb3f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -753,6 +753,9 @@ class FakePackage:
 requires_dist: tuple[str, ...] = ()
 # This will override the Name specified in the actual dist's METADATA.
 metadata_name: str | None = None
+ # Whether to delete the file this points to, which causes any attempt to fetch this
+ # package to fail unless it is processed as a metadata-only dist.
+ delete_linked_file: bool = False

 def metadata_filename(self) -> str:
 """This is specified by PEP 658."""
@@ -842,6 +845,27 @@ def fake_packages() -> dict[str, list[FakePackage]]:
 ("simple==1.0",),
 ),
 ],
+ "complex-dist": [
+ FakePackage(
+ "complex-dist",
+ "0.1",
+ "complex_dist-0.1-py2.py3-none-any.whl",
+ MetadataKind.Unhashed,
+ # Validate that the wheel isn't fetched if metadata is available and
+ # --dry-run is on, when the metadata presents no hash itself.
+ delete_linked_file=True,
+ ),
+ ],
+ "corruptwheel": [
+ FakePackage(
+ "corruptwheel",
+ "1.0",
+ "corruptwheel-1.0-py2.py3-none-any.whl",
+ # Validate that the wheel isn't fetched if metadata is available and
+ # --dry-run is on, when the metadata *does* present a hash.
+ MetadataKind.Sha256,
+ ),
+ ],
 "has-script": [
 # Ensure we check PEP 658 metadata hashing errors for wheel files.
 FakePackage(
@@ -927,10 +951,10 @@ def html_index_for_packages(
 f'    <a href="{package_link.filename}" {package_link.generate_additional_tag()}>{package_link.filename}</a><br/>'  # noqa: E501
 )
 # (3.2) Copy over the corresponding file in `shared_data.packages`.
- shutil.copy(
- shared_data.packages / package_link.filename,
- pkg_subdir / package_link.filename,
- )
+ cached_file = shared_data.packages / package_link.filename
+ new_file = pkg_subdir / package_link.filename
+ if not package_link.delete_linked_file:
+ shutil.copy(cached_file, new_file)
 # (3.3) Write a metadata file, if applicable.
 if package_link.metadata != MetadataKind.NoFile:
 with open(pkg_subdir / package_link.metadata_filename(), "wb") as f:
@@ -985,7 +1009,8 @@ def html_index_with_onetime_server(
 """Serve files from a generated pypi index, erroring if a file is downloaded
 more than once.

- Provide `-i http://localhost:8000` to pip invocations to point them at this server.
+ Provide `-i http://localhost:<port>` to pip invocations to point them at
+ this server.
 """

 class InDirectoryServer(http.server.ThreadingHTTPServer):
@@ -1000,7 +1025,7 @@ def finish_request(self: Self, request: Any, client_address: Any) -> None:
 class Handler(OneTimeDownloadHandler):
 _seen_paths: ClassVar[set[str]] = set()

- with InDirectoryServer(("", 8000), Handler) as httpd:
+ with InDirectoryServer(("", 0), Handler) as httpd:
 server_thread = threading.Thread(target=httpd.serve_forever)
 server_thread.start()
diff --git a/tests/functional/test_download.py b/tests/functional/test_download.py
index c5887aa1bf0..555822a903e 100644
--- a/tests/functional/test_download.py
+++ b/tests/functional/test_download.py
@@ -1276,6 +1276,7 @@ def download_server_html_index(
 ) -> Callable[..., tuple[TestPipResult, Path]]:
 """Execute `pip download` against a generated PyPI index."""
 download_dir = tmpdir / "download_dir"
+ server_port = html_index_with_onetime_server.server_address[1]

 def run_for_generated_index(
 args: list[str],
@@ -1290,7 +1291,7 @@
 "-d",
 str(download_dir),
 "-i",
- "http://localhost:8000",
+ f"http://localhost:{server_port}",
 *args,
 ]
 result = script.pip(*pip_args, allow_error=allow_error)
diff --git a/tests/functional/test_install_metadata.py b/tests/functional/test_install_metadata.py
new file mode 100644
index 00000000000..919c9e0e099
--- /dev/null
+++ b/tests/functional/test_install_metadata.py
@@ -0,0 +1,239 @@
+import json
+import re
+from collections.abc import Iterator
+from pathlib import Path
+from typing import Any, Callable
+
+import pytest
+
+from pip._vendor.packaging.requirements import Requirement
+
+from pip._internal.models.direct_url import DirectUrl
+from pip._internal.utils.urls import path_to_url
+
+from tests.lib import (
+ PipTestEnvironment,
+ TestPipResult,
+)
+
+
+@pytest.fixture
+def install_with_generated_html_index(
+ script: PipTestEnvironment,
+ html_index_for_packages: Path,
+ tmpdir: Path,
+) -> Callable[..., tuple[TestPipResult, dict[str, Any]]]:
+ """Execute `pip install` against a generated PyPI index."""
+ output_file = tmpdir / "output_file.json"
+
+ def run_for_generated_index(
+ args: list[str],
+ *,
+ dry_run: bool = True,
+ allow_error: bool = False,
+ ) -> tuple[TestPipResult, dict[str, Any]]:
+ """
+ Produce a PyPI directory structure pointing to the specified packages, then
+ execute `pip install --report ... -i ...` pointing to our generated index.
+ """ + pip_args = [ + "install", + *(("--dry-run",) if dry_run else ()), + "--ignore-installed", + "--report", + str(output_file), + "-i", + path_to_url(str(html_index_for_packages)), + *args, + ] + result = script.pip(*pip_args, allow_error=allow_error) + try: + with open(output_file, "rb") as f: + report = json.load(f) + except FileNotFoundError: + if allow_error: + report = {} + else: + raise + return (result, report) + + return run_for_generated_index + + +def iter_dists(report: dict[str, Any]) -> Iterator[tuple[Requirement, DirectUrl]]: + """Parse a (req,url) tuple from each installed dist in the --report json.""" + for inst in report["install"]: + metadata = inst["metadata"] + name = metadata["name"] + version = metadata["version"] + req = Requirement(f"{name}=={version}") + direct_url = DirectUrl.from_dict(inst["download_info"]) + yield (req, direct_url) + + +@pytest.mark.parametrize( + "requirement_to_install, expected_outputs", + [ + ("simple2==1.0", ["simple2==1.0", "simple==1.0"]), + ("simple==2.0", ["simple==2.0"]), + ( + "colander", + ["colander==0.9.9", "translationstring==1.1"], + ), + ( + "compilewheel", + ["compilewheel==1.0", "simple==1.0"], + ), + ], +) +def test_install_with_metadata( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement_to_install: str, + expected_outputs: list[str], +) -> None: + """Verify that if a data-dist-info-metadata attribute is present, then it is used + instead of the actual dist's METADATA.""" + _, report = install_with_generated_html_index( + [requirement_to_install], + ) + installed = sorted(str(r) for r, _ in iter_dists(report)) + assert installed == expected_outputs + + +@pytest.mark.parametrize( + "requirement_to_install, real_hash", + [ + ( + "simple==3.0", + "95e0f200b6302989bcf2cead9465cf229168295ea330ca30d1ffeab5c0fed996", + ), + ( + "has-script", + "16ba92d7f6f992f6de5ecb7d58c914675cf21f57f8e674fb29dcb4f4c9507e5b", + ), + ], +) +def test_incorrect_metadata_hash( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement_to_install: str, + real_hash: str, +) -> None: + """Verify that if a hash for data-dist-info-metadata is provided, it must match the + actual hash of the metadata file.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_msg = f"""\ + Expected sha256 wrong-hash + Got {real_hash}""" + assert expected_msg in result.stderr + + +@pytest.mark.parametrize( + "requirement_to_install, expected_url", + [ + ("simple2==2.0", "simple2-2.0.tar.gz.metadata"), + ("priority", "priority-1.0-py2.py3-none-any.whl.metadata"), + ], +) +def test_metadata_not_found( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement_to_install: str, + expected_url: str, +) -> None: + """Verify that if a data-dist-info-metadata attribute is provided, that pip will + fetch the .metadata file at the location specified by PEP 658, and error + if unavailable.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_re = re.escape(expected_url) + pattern = re.compile( + f"ERROR: 404 Client Error: FileNotFoundError for url:.*{expected_re}" + ) + assert pattern.search(result.stderr), (pattern, result.stderr) + + +def test_produces_error_for_mismatched_package_name_in_metadata( + install_with_generated_html_index: 
Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], +) -> None: + """Verify that the package name from the metadata matches the requested package.""" + result, _ = install_with_generated_html_index( + ["simple2==3.0"], + allow_error=True, + ) + assert result.returncode != 0 + assert ( + "simple2-3.0.tar.gz has inconsistent Name: expected 'simple2', but metadata " + "has 'not-simple2'" + ) in result.stdout + + +@pytest.mark.parametrize( + "requirement", + [ + "requires-simple-extra==0.1", + "REQUIRES_SIMPLE-EXTRA==0.1", + "REQUIRES....simple-_-EXTRA==0.1", + ], +) +def test_canonicalizes_package_name_before_verifying_metadata( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement: str, +) -> None: + """Verify that the package name from the command line and the package's + METADATA are both canonicalized before comparison, while the name from the METADATA + is always used verbatim to represent the installed candidate in --report. + + Regression test for https://github.com/pypa/pip/issues/12038 + """ + _, report = install_with_generated_html_index( + [requirement], + ) + reqs = [str(r) for r, _ in iter_dists(report)] + assert reqs == ["Requires_Simple.Extra==0.1"] + + +@pytest.mark.parametrize( + "requirement,err_string", + [ + # It's important that we verify pip won't even attempt to fetch the file, so we + # construct an input that will cause it to error if it tries at all. + ("complex-dist==0.1", "404 Client Error: FileNotFoundError"), + ("corruptwheel==1.0", ".whl is invalid."), + ], +) +def test_dry_run_avoids_downloading_metadata_only_dists( + install_with_generated_html_index: Callable[ + ..., tuple[TestPipResult, dict[str, Any]] + ], + requirement: str, + err_string: str, +) -> None: + """Verify that the underlying dist files are not downloaded at all when + `install --dry-run` is used to resolve dists with PEP 658 metadata.""" + _, report = install_with_generated_html_index( + [requirement], + ) + assert [requirement] == [str(r) for r, _ in iter_dists(report)] + result, _ = install_with_generated_html_index( + [requirement], + dry_run=False, + allow_error=True, + ) + assert result.returncode != 0 + assert err_string in result.stderr From 84c41ab8c2e6fc4682be8d2ed5a74d3267b92155 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 9 Aug 2023 18:39:53 -0400 Subject: [PATCH 4/5] make LinkMetadataCache - catch an exception when parsing metadata which only occurs in CI - handle --no-cache-dir - call os.makedirs() before writing to cache too - catch InvalidSchema when attempting git urls with BatchDownloader - fix other test failures - reuse should_cache(req) logic - gzip compress link metadata for a slight reduction in disk space - only cache built sdists - don't check should_cache() when fetching - cache lazy wheel dists - add news - turn debug logs in fetching from cache into exceptions - use scandir over listdir when searching normal wheel cache - handle metadata email parsing errors - correctly handle mutable cached requirement - use bz2 over gzip for an extremely slight improvement in disk usage - handle new google_paste encoding breakage --- news/12256.feature.rst | 1 + src/pip/_internal/cache.py | 118 ++++++++-- src/pip/_internal/cli/cmdoptions.py | 2 + src/pip/_internal/cli/req_command.py | 13 +- src/pip/_internal/exceptions.py | 19 ++ src/pip/_internal/metadata/__init__.py | 10 +- src/pip/_internal/metadata/base.py | 22 +- 
.../_internal/metadata/importlib/_dists.py | 5 +- src/pip/_internal/metadata/pkg_resources.py | 5 +- src/pip/_internal/operations/prepare.py | 202 ++++++++++++++++-- src/pip/_internal/req/constructors.py | 2 +- src/pip/_internal/req/req_install.py | 25 ++- src/pip/_internal/req/req_set.py | 2 +- src/pip/_internal/wheel_builder.py | 62 +----- tests/functional/test_check.py | 20 +- tests/functional/test_install.py | 8 +- tests/functional/test_install_check.py | 2 +- tests/functional/test_install_metadata.py | 5 +- tests/functional/test_install_reqs.py | 2 +- tests/functional/test_wheel.py | 2 +- tests/unit/test_cache.py | 22 +- tests/unit/test_wheel_builder.py | 11 +- 22 files changed, 421 insertions(+), 139 deletions(-) create mode 100644 news/12256.feature.rst diff --git a/news/12256.feature.rst b/news/12256.feature.rst new file mode 100644 index 00000000000..0b5bb356188 --- /dev/null +++ b/news/12256.feature.rst @@ -0,0 +1 @@ +Cache computed metadata from sdists and lazy wheels in ``~/.cache/pip/link-metadata`` when ``--use-feature=metadata-cache`` is enabled. diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index ce98f288395..ecf1b508129 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -2,12 +2,14 @@ from __future__ import annotations +import abc import hashlib import json import logging import os +import re +from collections.abc import Iterator from pathlib import Path -from typing import Any from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version from pip._vendor.packaging.utils import canonicalize_name @@ -16,21 +18,71 @@ from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel +from pip._internal.req.req_install import InstallRequirement from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds from pip._internal.utils.urls import path_to_url +from pip._internal.vcs import vcs logger = logging.getLogger(__name__) +_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) + ORIGIN_JSON_NAME = "origin.json" +def _contains_egg_info(s: str) -> bool: + """Determine whether the string looks like an egg_info. + + :param s: The string to parse. E.g. foo-2.1 + """ + return bool(_egg_info_re.search(s)) + + +def should_cache( + req: InstallRequirement, +) -> bool: + """ + Return whether a built InstallRequirement can be stored in the persistent + wheel cache, assuming the wheel cache is available, and _should_build() + has determined a wheel needs to be built. + """ + if not req.link: + return False + + if req.link.is_wheel: + return False + + if req.editable or not req.source_dir: + # never cache editable requirements + return False + + if req.link and req.link.is_vcs: + # VCS checkout. Do not cache + # unless it points to an immutable commit hash. + assert not req.editable + assert req.source_dir + vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) + assert vcs_backend + if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): + return True + return False + + assert req.link + base, ext = req.link.splitext() + if _contains_egg_info(base): + return True + + # Otherwise, do not cache. 
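+    # (Illustration, not part of the patch: a link ending in "pip-18.0.tar.gz"
+    # matches _egg_info_re above and is cached persistently, while an
+    # unversioned "master.tar.gz" falls through to here, since rebuilding it
+    # later could legitimately yield different contents.)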
+ return False + + def _hash_dict(d: dict[str, str]) -> str: """Return a stable sha224 of a dictionary.""" s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True) return hashlib.sha224(s.encode("ascii")).hexdigest() -class Cache: +class Cache(abc.ABC): """An abstract class - provides cache directories for data from links :param cache_dir: The root of the cache. @@ -74,20 +126,28 @@ def _get_cache_path_parts(self, link: Link) -> list[str]: return parts - def _get_candidates(self, link: Link, canonical_package_name: str) -> list[Any]: - can_not_cache = not self.cache_dir or not canonical_package_name or not link - if can_not_cache: - return [] + @abc.abstractmethod + def get_path_for_link(self, link: Link) -> str: + """Return a directory to store cached items in for link.""" + ... + + def cache_path(self, link: Link) -> Path: + return Path(self.get_path_for_link(link)) - path = self.get_path_for_link(link) - if os.path.isdir(path): - return [(candidate, path) for candidate in os.listdir(path)] - return [] + +class LinkMetadataCache(Cache): + """Persistently store the metadata of dists found at each link.""" def get_path_for_link(self, link: Link) -> str: - """Return a directory to store cached items in for link.""" - raise NotImplementedError() + parts = self._get_cache_path_parts(link) + assert self.cache_dir + return os.path.join(self.cache_dir, "link-metadata", *parts) + +class WheelCacheBase(Cache): + """Specializations to the cache concept for wheels.""" + + @abc.abstractmethod def get( self, link: Link, @@ -97,10 +157,27 @@ def get( """Returns a link to a cached item if it exists, otherwise returns the passed link. """ - raise NotImplementedError() + ... + + def _can_cache(self, link: Link, canonical_package_name: str) -> bool: + return bool(self.cache_dir and canonical_package_name and link) + def _get_candidates( + self, link: Link, canonical_package_name: str + ) -> Iterator[tuple[str, str]]: + if not self._can_cache(link, canonical_package_name): + return + + path = self.get_path_for_link(link) + if not os.path.isdir(path): + return -class SimpleWheelCache(Cache): + for candidate in os.scandir(path): + if candidate.is_file(): + yield (candidate.name, path) + + +class SimpleWheelCache(WheelCacheBase): """A cache of wheels for future installs.""" def __init__(self, cache_dir: str) -> None: @@ -132,7 +209,7 @@ def get( package_name: str | None, supported_tags: list[Tag], ) -> Link: - candidates = [] + candidates: list[tuple[int, str, str]] = [] if not package_name: return link @@ -206,7 +283,7 @@ def __init__( ) -class WheelCache(Cache): +class WheelCache(WheelCacheBase): """Wraps EphemWheelCache and SimpleWheelCache into a single Cache This Cache allows for gracefully degradation, using the ephem wheel cache @@ -224,6 +301,15 @@ def get_path_for_link(self, link: Link) -> str: def get_ephem_path_for_link(self, link: Link) -> str: return self._ephem_cache.get_path_for_link(link) + def resolve_cache_dir(self, req: InstallRequirement) -> str: + """Return the persistent or temporary cache directory where the built or + downloaded wheel should be stored.""" + cache_available = bool(self.cache_dir) + assert req.link, req + if cache_available and should_cache(req): + return self.get_path_for_link(req.link) + return self.get_ephem_path_for_link(req.link) + def get( self, link: Link, diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py index 3519dadf13d..68f3927d808 100644 --- a/src/pip/_internal/cli/cmdoptions.py +++ 
b/src/pip/_internal/cli/cmdoptions.py @@ -1072,6 +1072,8 @@ def check_list_path_option(options: Values) -> None: default=[], choices=[ "fast-deps", + "metadata-cache", + "truststore", ] + ALWAYS_ENABLED_FEATURES, help="Enable new functionality, that may be backward incompatible.", diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index dc1328ff019..05d9cdde176 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -13,7 +13,7 @@ from typing import Any from pip._internal.build_env import SubprocessBuildEnvironmentInstaller -from pip._internal.cache import WheelCache +from pip._internal.cache import LinkMetadataCache, WheelCache from pip._internal.cli import cmdoptions from pip._internal.cli.index_command import IndexGroupCommand from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin @@ -132,6 +132,16 @@ def make_requirement_preparer( "fast-deps has no effect when used with the legacy resolver." ) + if options.cache_dir and "metadata-cache" in options.features_enabled: + logger.warning( + "pip is using a local cache for metadata information. " + "This experimental feature is enabled through " + "--use-feature=metadata-cache and it is not ready for " + "production." + ) + metadata_cache = LinkMetadataCache(options.cache_dir) + else: + metadata_cache = None return RequirementPreparer( build_dir=temp_build_dir_path, src_dir=options.src_dir, @@ -149,6 +159,7 @@ def make_requirement_preparer( verbosity=verbosity, legacy_resolver=legacy_resolver, resume_retries=options.resume_retries, + metadata_cache=metadata_cache, ) @classmethod diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 98f95494c62..4d90f6877fe 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -252,6 +252,25 @@ def __str__(self) -> str: return f"None {self.metadata_name} metadata found for distribution: {self.dist}" +class CacheMetadataError(PipError): + """Raised when de/serializing a requirement into the metadata cache.""" + + def __init__( + self, + req: InstallRequirement, + reason: str, + ) -> None: + """ + :param req: The requirement we attempted to cache. + :param reason: Context about the precise error that occurred. 
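+
+        For example, ``reason`` may be ``"error reading cached metadata"``, as
+        raised from ``RequirementPreparer._fetch_cached_metadata()``.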
+ """ + self.req = req + self.reason = reason + + def __str__(self) -> str: + return f"{self.reason} for {self.req} from {self.req.link}" + + class UserInstallationInvalid(InstallationError): """A --user install is requested on an environment without user site.""" diff --git a/src/pip/_internal/metadata/__init__.py b/src/pip/_internal/metadata/__init__.py index 927e375cad0..368950f9946 100644 --- a/src/pip/_internal/metadata/__init__.py +++ b/src/pip/_internal/metadata/__init__.py @@ -9,7 +9,14 @@ from pip._internal.utils.deprecation import deprecated from pip._internal.utils.misc import strtobool -from .base import BaseDistribution, BaseEnvironment, FilesystemWheel, MemoryWheel, Wheel +from .base import ( + BaseDistribution, + BaseEnvironment, + FilesystemWheel, + MemoryWheel, + Wheel, + serialize_metadata, +) __all__ = [ "BaseDistribution", @@ -21,6 +28,7 @@ "get_environment", "get_wheel_distribution", "select_backend", + "serialize_metadata", ] diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index 5ed9c5d0fa6..4e0d23625cd 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -1,7 +1,10 @@ from __future__ import annotations import csv +import email.generator import email.message +import email.policy +import email.utils import functools import json import logging @@ -11,10 +14,10 @@ from collections.abc import Collection, Container, Iterable, Iterator from typing import ( IO, + TYPE_CHECKING, Any, NamedTuple, Protocol, - Union, ) from pip._vendor.packaging.requirements import Requirement @@ -36,7 +39,8 @@ from ._json import msg_to_json -InfoPath = Union[str, pathlib.PurePath] +if TYPE_CHECKING: + InfoPath = str | pathlib.PurePath logger = logging.getLogger(__name__) @@ -85,6 +89,20 @@ def _convert_installed_files_path( return str(pathlib.Path(*info, *entry)) +def serialize_metadata(msg: email.message.Message) -> str: + """Write a dist's metadata to a string. + + Calling ``str(dist.metadata)`` may raise an error by misinterpreting RST directives + as email headers. This method uses the more robust ``email.policy.EmailPolicy`` to + avoid those parsing errors.""" + # Packages such as google_pasta-0.2.0 trigger a particular encoding behavior that + # required an upstream fix (https://github.com/python/cpython/pull/117369): + # + # > email.errors.HeaderWriteError: folded header + # > contains newline: 'Description: UNKNOWN\n\n\n' + return msg.as_string(policy=email.policy.HTTP.clone(refold_source="all")) + + class RequiresEntry(NamedTuple): requirement: str extra: str diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py index cb9ceb2e19e..9bd3e0766fb 100644 --- a/src/pip/_internal/metadata/importlib/_dists.py +++ b/src/pip/_internal/metadata/importlib/_dists.py @@ -7,6 +7,7 @@ from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence from os import PathLike from typing import ( + TYPE_CHECKING, cast, ) @@ -19,7 +20,6 @@ from pip._internal.metadata.base import ( BaseDistribution, BaseEntryPoint, - InfoPath, Wheel, ) from pip._internal.utils.misc import normalize_path @@ -33,6 +33,9 @@ parse_name_and_version_from_info_directory, ) +if TYPE_CHECKING: + from pip._internal.metadata.base import InfoPath + class WheelDistribution(importlib.metadata.Distribution): """An ``importlib.metadata.Distribution`` read from a wheel. 
diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py index 1b9a59458f7..0158972cd3d 100644 --- a/src/pip/_internal/metadata/pkg_resources.py +++ b/src/pip/_internal/metadata/pkg_resources.py @@ -7,6 +7,7 @@ import zipfile from collections.abc import Collection, Iterable, Iterator, Mapping from typing import ( + TYPE_CHECKING, NamedTuple, ) @@ -25,10 +26,12 @@ BaseDistribution, BaseEntryPoint, BaseEnvironment, - InfoPath, Wheel, ) +if TYPE_CHECKING: + from .base import InfoPath + __all__ = ["NAME", "Distribution", "Environment"] logger = logging.getLogger(__name__) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 9e920e9218b..c037ba242b5 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -4,6 +4,8 @@ # mypy: strict-optional=False from __future__ import annotations +import bz2 +import json import mimetypes import os import shutil @@ -13,10 +15,13 @@ from typing import TYPE_CHECKING from pip._vendor.packaging.utils import canonicalize_name +from pip._vendor.requests.exceptions import InvalidSchema from pip._internal.build_env import BuildEnvironmentInstaller +from pip._internal.cache import LinkMetadataCache, should_cache from pip._internal.distributions import make_distribution_for_install_requirement from pip._internal.exceptions import ( + CacheMetadataError, DirectoryUrlHashUnsupported, HashMismatch, HashUnpinned, @@ -26,7 +31,11 @@ VcsHashUnsupported, ) from pip._internal.index.package_finder import PackageFinder -from pip._internal.metadata import BaseDistribution, get_metadata_distribution +from pip._internal.metadata import ( + BaseDistribution, + get_metadata_distribution, + serialize_metadata, +) from pip._internal.models.direct_url import ArchiveInfo from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel @@ -67,7 +76,7 @@ def _get_prepared_distribution( build_env_installer: BuildEnvironmentInstaller, build_isolation: bool, check_build_deps: bool, -) -> BaseDistribution: +) -> tuple[bool, BaseDistribution]: """Prepare a distribution for installation. This method will only be called by the preparer at the end of the resolve, and only @@ -78,15 +87,20 @@ def _get_prepared_distribution( This method will execute ``req.cache_concrete_dist()``, so that after invocation, ``req.is_concrete`` will be True, because ``req.get_dist()`` will return a concrete ``Distribution``. + + :returns: a 2-tuple: + - (bool): whether the metadata had to be constructed (e.g. from an sdist build), + - (BaseDistribution): the concrete distribution which is ready to be installed. 
""" abstract_dist = make_distribution_for_install_requirement(req) tracker_id = abstract_dist.build_tracker_id - if tracker_id is not None: + builds_metadata = tracker_id is not None + if builds_metadata: with build_tracker.track(req, tracker_id): abstract_dist.prepare_distribution_metadata( build_env_installer, build_isolation, check_build_deps ) - return abstract_dist.get_metadata_distribution() + return (builds_metadata, abstract_dist.get_metadata_distribution()) def unpack_vcs_link(link: Link, location: str, verbosity: int) -> None: @@ -233,6 +247,45 @@ def _check_download_dir( return download_path +@dataclass(frozen=True) +class CacheableDist: + metadata: str + filename: Path + canonical_name: str + + @classmethod + def from_dist(cls, link: Link, dist: BaseDistribution) -> CacheableDist: + """Extract the serializable data necessary to generate a metadata-only dist.""" + return cls( + metadata=serialize_metadata(dist.metadata), + filename=Path(link.filename), + canonical_name=dist.canonical_name, + ) + + def to_dist(self) -> BaseDistribution: + """Return a metadata-only dist from the deserialized cache entry.""" + return get_metadata_distribution( + metadata_contents=self.metadata.encode("utf-8"), + filename=str(self.filename), + canonical_name=self.canonical_name, + ) + + def to_json(self) -> dict[str, str]: + return { + "metadata": self.metadata, + "filename": str(self.filename), + "canonical_name": self.canonical_name, + } + + @classmethod + def from_json(cls, args: dict[str, str]) -> CacheableDist: + return cls( + metadata=args["metadata"], + filename=Path(args["filename"]), + canonical_name=args["canonical_name"], + ) + + class RequirementPreparer: """Prepares a Requirement""" @@ -255,6 +308,7 @@ def __init__( # noqa: PLR0913 (too many parameters) verbosity: int, legacy_resolver: bool, resume_retries: int, + metadata_cache: LinkMetadataCache | None = None, ) -> None: super().__init__() @@ -297,6 +351,8 @@ def __init__( # noqa: PLR0913 (too many parameters) # Previous "header" printed for a link-based InstallRequirement self._previous_requirement_header = ("", "") + self._metadata_cache = metadata_cache + def _log_preparing_link(self, req: InstallRequirement) -> None: """Provide context for the requirement being prepared.""" if req.link.is_file and not req.is_wheel_from_cache: @@ -426,14 +482,96 @@ def _fetch_metadata_only( ) return None if self.require_hashes: + # Hash checking also means hashes are provided for all reqs, so no resolve + # is necessary and metadata-only fetching provides no speedup. logger.debug( "Metadata-only fetching is not used as hash checking is required", ) return None - # Try PEP 658 metadata first, then fall back to lazy wheel if unavailable. - return self._fetch_metadata_using_link_data_attr( - req - ) or self._fetch_metadata_using_lazy_wheel(req.link) + + if cached_dist := self._fetch_cached_metadata(req): + return cached_dist + # If we've used the lazy wheel approach, then PEP 658 metadata is not available. + # If the wheel is very large (>1GB), then retrieving it from the CacheControl + # HTTP cache may take multiple seconds, even on a fast computer, and the + # preparer will unnecessarily copy the cached response to disk before deleting + # it at the end of the run. Caching the dist metadata in LinkMetadataCache means + # later pip executions can retrieve metadata within milliseconds and avoid + # thrashing the disk. 
+ # Even if we do employ PEP 658 metadata, we would still have to ping PyPI to + # ensure the .metadata file hasn't changed if we relied on CacheControl, even + # though PEP 658 metadata is guaranteed to be immutable. We optimize for this + # case by referring to our local cache. + if cached_dist := ( + self._fetch_metadata_using_link_data_attr(req) + or self._fetch_metadata_using_lazy_wheel(req) + ): + self._cache_metadata(req, cached_dist) + return cached_dist + return None + + def _locate_metadata_cache_entry(self, link: Link) -> Path | None: + """If the metadata cache is active, generate a filesystem path from the hash of + the given Link.""" + if self._metadata_cache is None: + return None + + return self._metadata_cache.cache_path(link) + + def _fetch_cached_metadata( + self, req: InstallRequirement + ) -> BaseDistribution | None: + cached_path = self._locate_metadata_cache_entry(req.link) + if cached_path is None: + return None + + # Quietly continue if the cache entry does not exist. + if not os.path.isfile(cached_path): + logger.debug( + "no cached metadata for link %s at %s", + req.link, + cached_path, + ) + return None + + try: + with bz2.open(cached_path, mode="rt", encoding="utf-8") as f: + logger.debug( + "found cached metadata for link %s at %s", req.link, cached_path + ) + args = json.load(f) + cached_dist = CacheableDist.from_json(args) + return cached_dist.to_dist() + except Exception: + raise CacheMetadataError(req, "error reading cached metadata") + + def _cache_metadata( + self, + req: InstallRequirement, + metadata_dist: BaseDistribution, + ) -> None: + cached_path = self._locate_metadata_cache_entry(req.link) + if cached_path is None: + return + + # The cache file exists already, so we have nothing to do. + if os.path.isfile(cached_path): + logger.debug( + "metadata for link %s is already cached at %s", req.link, cached_path + ) + return + + # The metadata cache is split across several subdirectories, so ensure the + # containing directory for the cache file exists before writing. + os.makedirs(str(cached_path.parent), exist_ok=True) + try: + cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) + args = cacheable_dist.to_json() + logger.debug("caching metadata for link %s at %s", req.link, cached_path) + with bz2.open(cached_path, mode="wt", encoding="utf-8") as f: + json.dump(args, f) + except Exception: + raise CacheMetadataError(req, "failed to serialize metadata") def _fetch_metadata_using_link_data_attr( self, @@ -451,6 +589,9 @@ def _fetch_metadata_using_link_data_attr( metadata_link, ) # (2) Download the contents of the METADATA file, separate from the dist itself. + # NB: this request will hit the CacheControl HTTP cache, which will be very + # quick since the METADATA file is very small. Therefore, we can rely on + # HTTP caching instead of LinkMetadataCache. metadata_file = get_http_url( metadata_link, self._download, @@ -478,36 +619,38 @@ def _fetch_metadata_using_link_data_attr( def _fetch_metadata_using_lazy_wheel( self, - link: Link, + req: InstallRequirement, ) -> BaseDistribution | None: """Fetch metadata using lazy wheel, if possible.""" # --use-feature=fast-deps must be provided. 
if not self.use_lazy_wheel: return None - if link.is_file or not link.is_wheel: + if req.link.is_file or not req.link.is_wheel: logger.debug( "Lazy wheel is not used as %r does not point to a remote wheel", - link, + req.link, ) return None - wheel = Wheel(link.filename) + wheel = Wheel(req.link.filename) name = canonicalize_name(wheel.name) logger.info( "Obtaining dependency information from %s %s", name, wheel.version, ) - url = link.url.split("#", 1)[0] + try: - return dist_from_wheel_url(name, url, self._session) + return dist_from_wheel_url( + name, req.link.url_without_fragment, self._session + ) except HTTPRangeRequestUnsupported: - logger.debug("%s does not support range requests", url) + logger.debug("%s does not support range requests", req.link) return None def _complete_partial_requirements( self, - metadata_only_reqs: Iterable[InstallRequirement], + metadata_only_reqs: list[InstallRequirement], parallel_builds: bool = False, ) -> None: """Download any requirements which were only fetched by metadata.""" @@ -518,9 +661,24 @@ def _complete_partial_requirements( # Map each link to the requirement that owns it. This allows us to set # `req.local_file_path` on the appropriate requirement after passing # all the links at once into BatchDownloader. - links_to_fully_download: dict[Link, InstallRequirement] = { - req.link: req for req in metadata_only_reqs - } + links_to_fully_download: dict[Link, InstallRequirement] = {} + for req in metadata_only_reqs: + assert req.link + + # (1) File URLs don't need to be downloaded, so skip them. + if req.link.scheme == "file": + continue + # (2) If this is e.g. a git url, we don't know how to handle that in the + # BatchDownloader, so leave it for self._prepare_linked_requirement() at + # the end of this method, which knows how to handle any URL. + can_simply_download = True + try: + # This will raise InvalidSchema if our Session can't download it. 
+ self._session.get_adapter(req.link.url) + except InvalidSchema: + can_simply_download = False + if can_simply_download: + links_to_fully_download[req.link] = req batch_download = self._download.batch(links_to_fully_download.keys(), temp_dir) for link, (filepath, _) in batch_download: @@ -695,7 +853,7 @@ def _ensure_local_file_path( req.local_file_path = local_file.path def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistribution: - dist = _get_prepared_distribution( + (builds_metadata, dist) = _get_prepared_distribution( req, self.build_tracker, self.build_env_installer, @@ -705,6 +863,10 @@ def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistributio assert dist.is_concrete assert req.is_concrete assert req.get_dist() is dist + + if builds_metadata and should_cache(req): + self._cache_metadata(req, dist) + return dist def _prepare_linked_requirement( diff --git a/src/pip/_internal/req/constructors.py b/src/pip/_internal/req/constructors.py index 056e7e3a7f1..121b75f0df2 100644 --- a/src/pip/_internal/req/constructors.py +++ b/src/pip/_internal/req/constructors.py @@ -554,7 +554,7 @@ def install_req_extend_extras( """ result = copy.copy(ireq) result.extras = {*ireq.extras, *extras} - result.req = ( + result._req = ( _set_requirement_extras(ireq.req, result.extras) if ireq.req is not None else None diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index c284248e383..378fd63f33b 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -91,7 +91,7 @@ def __init__( permit_editable_wheels: bool = False, ) -> None: assert req is None or isinstance(req, Requirement), req - self.req = req + self._req = req self.comes_from = comes_from self.constraint = constraint self.editable = editable @@ -204,6 +204,23 @@ def __init__( # This requirement needs to be unpacked before it can be installed. self._archive_source: Path | None = None + @property + def req(self) -> Requirement | None: + """Calculate a requirement from the cached dist, if populated. 
+ + The cached dist can be populated by either + ``self.cache_virtual_metadata_only_dist()`` or + ``self.cache_concrete_dist()`` and can also be retrieved with + ``self.get_dist()``.""" + if self._req is not None: + return self._req + if self._dist is not None: + name = self._dist.canonical_name + version = str(self._dist.version) + self._req = Requirement(f"{name}=={version}") + return self._req + return None + def __str__(self) -> str: if self.req: s = redact_auth_from_requirement(self.req) @@ -393,7 +410,7 @@ def ensure_build_location( def _set_requirement(self) -> None: """Set requirement after generating metadata.""" - assert self.req is None + assert self._req is None assert self.metadata is not None assert self.source_dir is not None @@ -403,7 +420,7 @@ def _set_requirement(self) -> None: else: op = "===" - self.req = get_requirement( + self._req = get_requirement( "".join( [ self.metadata["Name"], @@ -429,7 +446,7 @@ def warn_on_mismatching_name(self) -> None: metadata_name, self.name, ) - self.req = get_requirement(metadata_name) + self._req = get_requirement(metadata_name) def check_if_exists(self, use_user_site: bool) -> None: """Find an installed distribution that satisfies or conflicts diff --git a/src/pip/_internal/req/req_set.py b/src/pip/_internal/req/req_set.py index 3451b24f27b..8db9b89d85f 100644 --- a/src/pip/_internal/req/req_set.py +++ b/src/pip/_internal/req/req_set.py @@ -42,7 +42,7 @@ def add_unnamed_requirement(self, install_req: InstallRequirement) -> None: self.unnamed_requirements.append(install_req) def add_named_requirement(self, install_req: InstallRequirement) -> None: - assert install_req.name + assert install_req.name, install_req project_name = canonicalize_name(install_req.name) self.requirements[project_name] = install_req diff --git a/src/pip/_internal/wheel_builder.py b/src/pip/_internal/wheel_builder.py index beed02f00c5..5ced2679e33 100644 --- a/src/pip/_internal/wheel_builder.py +++ b/src/pip/_internal/wheel_builder.py @@ -4,7 +4,6 @@ import logging import os.path -import re import shutil from collections.abc import Iterable @@ -26,23 +25,12 @@ from pip._internal.utils.subprocess import call_subprocess from pip._internal.utils.temp_dir import TempDirectory from pip._internal.utils.urls import path_to_url -from pip._internal.vcs import vcs logger = logging.getLogger(__name__) -_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) - BuildResult = tuple[list[InstallRequirement], list[InstallRequirement]] -def _contains_egg_info(s: str) -> bool: - """Determine whether the string looks like an egg_info. - - :param s: The string to parse. E.g. foo-2.1 - """ - return bool(_egg_info_re.search(s)) - - def _should_build( req: InstallRequirement, ) -> bool: @@ -67,54 +55,6 @@ def should_build_for_install_command( return _should_build(req) -def _should_cache( - req: InstallRequirement, -) -> bool | None: - """ - Return whether a built InstallRequirement can be stored in the persistent - wheel cache, assuming the wheel cache is available, and _should_build() - has determined a wheel needs to be built. - """ - if req.editable or not req.source_dir: - # never cache editable requirements - return False - - if req.link and req.link.is_vcs: - # VCS checkout. Do not cache - # unless it points to an immutable commit hash. 
- assert not req.editable - assert req.source_dir - vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) - assert vcs_backend - if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): - return True - return False - - assert req.link - base, ext = req.link.splitext() - if _contains_egg_info(base): - return True - - # Otherwise, do not cache. - return False - - -def _get_cache_dir( - req: InstallRequirement, - wheel_cache: WheelCache, -) -> str: - """Return the persistent or temporary cache directory where the built - wheel need to be stored. - """ - cache_available = bool(wheel_cache.cache_dir) - assert req.link - if cache_available and _should_cache(req): - cache_dir = wheel_cache.get_path_for_link(req.link) - else: - cache_dir = wheel_cache.get_ephem_path_for_link(req.link) - return cache_dir - - def _verify_one(req: InstallRequirement, wheel_path: str) -> None: canonical_name = canonicalize_name(req.name or "") w = Wheel(os.path.basename(wheel_path)) @@ -295,7 +235,7 @@ def build( build_successes, build_failures = [], [] for req in requirements: assert req.name - cache_dir = _get_cache_dir(req, wheel_cache) + cache_dir = wheel_cache.resolve_cache_dir(req) wheel_file = _build_one( req, cache_dir, diff --git a/tests/functional/test_check.py b/tests/functional/test_check.py index 06ed1b08eac..f3c309b0f45 100644 --- a/tests/functional/test_check.py +++ b/tests/functional/test_check.py @@ -123,10 +123,7 @@ def test_check_complicated_name_missing(script: PipTestEnvironment) -> None: # Without dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) @@ -149,10 +146,7 @@ def test_check_complicated_name_broken(script: PipTestEnvironment) -> None: # With broken dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -185,10 +179,7 @@ def test_check_complicated_name_clean(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -216,10 +207,7 @@ def test_check_considers_conditional_reqs(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) diff --git a/tests/functional/test_install.py b/tests/functional/test_install.py index a1bd81d31d0..5092c28a218 100644 --- a/tests/functional/test_install.py +++ 
b/tests/functional/test_install.py @@ -2128,7 +2128,7 @@ def test_install_conflict_results_in_warning( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency result2 = script.pip( @@ -2138,7 +2138,7 @@ def test_install_conflict_results_in_warning( allow_stderr_error=True, ) assert "pkga 1.0 requires pkgb==1.0" in result2.stderr, str(result2) - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_install_conflict_warning_can_be_suppressed( @@ -2158,11 +2158,11 @@ def test_install_conflict_warning_can_be_suppressed( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency; suppressing warning result2 = script.pip("install", "--no-index", pkgB_path, "--no-warn-conflicts") - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_target_install_ignores_distutils_config_install_prefix( diff --git a/tests/functional/test_install_check.py b/tests/functional/test_install_check.py index a0598fe2703..7f40a711d73 100644 --- a/tests/functional/test_install_check.py +++ b/tests/functional/test_install_check.py @@ -28,7 +28,7 @@ def test_check_install_canonicalization(script: PipTestEnvironment) -> None: # Let's install pkgA without its dependency result = script.pip("install", "--no-index", pkga_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result.stdout, str(result) + assert "Successfully installed pkga-1.0" in result.stdout, str(result) # Install the first missing dependency. Only an error for the # second dependency should remain. diff --git a/tests/functional/test_install_metadata.py b/tests/functional/test_install_metadata.py index 919c9e0e099..8a5e28b0b1b 100644 --- a/tests/functional/test_install_metadata.py +++ b/tests/functional/test_install_metadata.py @@ -213,7 +213,10 @@ def test_canonicalizes_package_name_before_verifying_metadata( [ # It's important that we verify pip won't even attempt to fetch the file, so we # construct an input that will cause it to error if it tries at all. 
- ("complex-dist==0.1", "404 Client Error: FileNotFoundError"), + ( + "complex-dist==0.1", + "Could not install packages due to an OSError: [Errno 2] No such file or directory", # noqa: E501 + ), ("corruptwheel==1.0", ".whl is invalid."), ], ) diff --git a/tests/functional/test_install_reqs.py b/tests/functional/test_install_reqs.py index 3c5b6db4a68..66497c9eb70 100644 --- a/tests/functional/test_install_reqs.py +++ b/tests/functional/test_install_reqs.py @@ -728,7 +728,7 @@ def test_install_distribution_full_union( result = script.pip_install_local( to_install, f"{to_install}[bar]", f"{to_install}[baz]" ) - assert "Building wheel for LocalExtras" in result.stdout + assert "Building wheel for localextras" in result.stdout result.did_create(script.site_packages / "simple") result.did_create(script.site_packages / "singlemodule.py") diff --git a/tests/functional/test_wheel.py b/tests/functional/test_wheel.py index e1ede880496..3496522c4b0 100644 --- a/tests/functional/test_wheel.py +++ b/tests/functional/test_wheel.py @@ -322,7 +322,7 @@ def test_wheel_package_with_latin1_setup( pkg_to_wheel = data.packages.joinpath("SetupPyLatin1") result = script.pip("wheel", pkg_to_wheel) - assert "Successfully built SetupPyUTF8" in result.stdout + assert "Successfully built setuppyutf8" in result.stdout def test_pip_wheel_with_pep518_build_reqs( diff --git a/tests/unit/test_cache.py b/tests/unit/test_cache.py index 30cdb6ebece..d7dc91bfcbb 100644 --- a/tests/unit/test_cache.py +++ b/tests/unit/test_cache.py @@ -1,13 +1,33 @@ import os from pathlib import Path +import pytest + from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version -from pip._internal.cache import WheelCache, _hash_dict +from pip._internal.cache import WheelCache, _contains_egg_info, _hash_dict from pip._internal.models.link import Link from pip._internal.utils.misc import ensure_dir +@pytest.mark.parametrize( + "s, expected", + [ + # Trivial. + ("pip-18.0", True), + # Ambiguous. + ("foo-2-2", True), + ("im-valid", True), + # Invalid. 
+ ("invalid", False), + ("im_invalid", False), + ], +) +def test_contains_egg_info(s: str, expected: bool) -> None: + result = _contains_egg_info(s) + assert result == expected + + def test_falsey_path_none() -> None: wc = WheelCache("") assert wc.cache_dir is None diff --git a/tests/unit/test_wheel_builder.py b/tests/unit/test_wheel_builder.py index 0547ac818bc..b2ed3a89b28 100644 --- a/tests/unit/test_wheel_builder.py +++ b/tests/unit/test_wheel_builder.py @@ -8,7 +8,8 @@ import pytest -from pip._internal import wheel_builder +from pip._internal import cache, wheel_builder +from pip._internal.cache import _contains_egg_info from pip._internal.models.link import Link from pip._internal.operations.build.wheel_legacy import format_command_result from pip._internal.req.req_install import InstallRequirement @@ -31,7 +32,7 @@ ], ) def test_contains_egg_info(s: str, expected: bool) -> None: - result = wheel_builder._contains_egg_info(s) + result = _contains_egg_info(s) assert result == expected @@ -96,7 +97,7 @@ def test_should_build_for_install_command(req: ReqMock, expected: bool) -> None: ], ) def test_should_cache(req: ReqMock, expected: bool) -> None: - assert wheel_builder._should_cache(cast(InstallRequirement, req)) is expected + assert cache.should_cache(cast(InstallRequirement, req)) is expected def test_should_cache_git_sha(tmpdir: Path) -> None: @@ -106,12 +107,12 @@ def test_should_cache_git_sha(tmpdir: Path) -> None: # a link referencing a sha should be cached url = "git+https://g.c/o/r@" + commit + "#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert wheel_builder._should_cache(cast(InstallRequirement, req)) + assert cache.should_cache(cast(InstallRequirement, req)) # a link not referencing a sha should not be cached url = "git+https://g.c/o/r@master#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert not wheel_builder._should_cache(cast(InstallRequirement, req)) + assert not cache.should_cache(cast(InstallRequirement, req)) def test_format_command_result__INFO(caplog: pytest.LogCaptureFixture) -> None: From cbff460585a423ac0dee2b95fae6a2a1ff70d777 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 9 Aug 2023 20:09:20 -0400 Subject: [PATCH 5/5] make FetchResolveCache - pipe in headers arg - provide full context in Link.comes_from - pull in etag and date and cache the outputs - handle --no-cache-dir - add NEWS - remove quotes from etag and use binary checksum to save a few bytes - parse http modified date to compress the cached representation - fix cache-control clobbering --- news/12257.feature.rst | 1 + src/pip/_internal/cache.py | 32 ++- src/pip/_internal/cli/req_command.py | 7 +- src/pip/_internal/index/collector.py | 50 +++-- src/pip/_internal/index/package_finder.py | 243 +++++++++++++++++++++- src/pip/_internal/models/link.py | 6 +- tests/unit/test_collector.py | 12 +- 7 files changed, 311 insertions(+), 40 deletions(-) create mode 100644 news/12257.feature.rst diff --git a/news/12257.feature.rst b/news/12257.feature.rst new file mode 100644 index 00000000000..21ea8cf5942 --- /dev/null +++ b/news/12257.feature.rst @@ -0,0 +1 @@ +Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled. 
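
For background on the validators this patch persists: the flow is ordinary HTTP
conditional revalidation, where a saved ``ETag`` is replayed via
``If-None-Match`` (and a saved date via ``If-Modified-Since``) so that an
unchanged index page costs only a 304 response with no body. Below is a minimal
stdlib sketch of that handshake; the ``fetch_index`` helper and its on-disk
layout are hypothetical illustrations, not code from this patch:

    import urllib.error
    import urllib.request
    from pathlib import Path

    def fetch_index(url: str, cache_dir: Path) -> bytes:
        """Conditionally fetch ``url``, replaying a previously saved ETag."""
        cache_dir.mkdir(parents=True, exist_ok=True)
        etag_path = cache_dir / "etag"
        body_path = cache_dir / "body"

        request = urllib.request.Request(url)
        if etag_path.is_file():
            # If-None-Match asks the server to reply 304 Not Modified when the
            # resource still has this ETag.
            request.add_header("If-None-Match", etag_path.read_text())

        try:
            with urllib.request.urlopen(request) as resp:
                body = resp.read()
                if etag := resp.headers.get("ETag"):
                    etag_path.write_text(etag)
                body_path.write_bytes(body)
                return body
        except urllib.error.HTTPError as e:
            if e.code == 304:
                # Not Modified: no body was transferred; reuse the cached copy.
                return body_path.read_bytes()
            raise

The ``FetchResolveCache`` introduced below goes further than this sketch: it
also records the response ``Date`` and a sha256 checksum of the page body, so
an unchanged page can be detected even when the validators are imperfect.
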
diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index ecf1b508129..0f1c54c328c 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -93,7 +93,9 @@ def __init__(self, cache_dir: str) -> None: assert not cache_dir or os.path.isabs(cache_dir) self.cache_dir = cache_dir or None - def _get_cache_path_parts(self, link: Link) -> list[str]: + def _get_cache_path_parts( + self, link: Link, *, interpreter_dependent: bool + ) -> list[str]: """Get parts of part that must be os.path.joined with cache_dir""" # We want to generate an url to use as our cache key, we don't want to @@ -105,13 +107,14 @@ def _get_cache_path_parts(self, link: Link) -> list[str]: if link.subdirectory_fragment: key_parts["subdirectory"] = link.subdirectory_fragment - # Include interpreter name, major and minor version in cache key - # to cope with ill-behaved sdists that build a different wheel - # depending on the python version their setup.py is being run on, - # and don't encode the difference in compatibility tags. - # https://github.com/pypa/pip/issues/7296 - key_parts["interpreter_name"] = interpreter_name() - key_parts["interpreter_version"] = interpreter_version() + if interpreter_dependent: + # Include interpreter name, major and minor version in cache key + # to cope with ill-behaved sdists that build a different wheel + # depending on the python version their setup.py is being run on, + # and don't encode the difference in compatibility tags. + # https://github.com/pypa/pip/issues/7296 + key_parts["interpreter_name"] = interpreter_name() + key_parts["interpreter_version"] = interpreter_version() # Encode our key url with sha224, we'll use this because it has similar # security properties to sha256, but with a shorter total output (and @@ -139,11 +142,20 @@ class LinkMetadataCache(Cache): """Persistently store the metadata of dists found at each link.""" def get_path_for_link(self, link: Link) -> str: - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir return os.path.join(self.cache_dir, "link-metadata", *parts) +class FetchResolveCache(Cache): + def get_path_for_link(self, link: Link) -> str: + # We are reading index links to extract other links from, not executing any + # python code, so these caches are interpreter-independent. + parts = self._get_cache_path_parts(link, interpreter_dependent=False) + assert self.cache_dir + return os.path.join(self.cache_dir, "fetch-resolve", *parts) + + class WheelCacheBase(Cache): """Specializations to the cache concept for wheels.""" @@ -198,7 +210,7 @@ def get_path_for_link(self, link: Link) -> str: :param link: The link of the sdist for which this will cache wheels. 
""" - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir # Store wheels within the root cache_dir return os.path.join(self.cache_dir, "wheels", *parts) diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index 05d9cdde176..864cc2959c5 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -13,7 +13,7 @@ from typing import Any from pip._internal.build_env import SubprocessBuildEnvironmentInstaller -from pip._internal.cache import LinkMetadataCache, WheelCache +from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache from pip._internal.cli import cmdoptions from pip._internal.cli.index_command import IndexGroupCommand from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin @@ -355,8 +355,13 @@ def _build_package_finder( ignore_requires_python=ignore_requires_python, ) + if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled): + fetch_resolve_cache = FetchResolveCache(options.cache_dir) + else: + fetch_resolve_cache = None return PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + fetch_resolve_cache=fetch_resolve_cache, ) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 00d66daa3bf..fbbeaf5a23e 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -87,7 +87,9 @@ class _NotHTTP(Exception): pass -def _ensure_api_response(url: str, session: PipSession) -> None: +def _ensure_api_response( + url: str, session: PipSession, headers: dict[str, str] | None = None +) -> None: """ Send a HEAD request to the URL, and ensure the response contains a simple API Response. @@ -99,13 +101,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None: if scheme not in {"http", "https"}: raise _NotHTTP() - resp = session.head(url, allow_redirects=True) + resp = session.head(url, allow_redirects=True, headers=headers) raise_for_status(resp) _ensure_api_header(resp) -def _get_simple_response(url: str, session: PipSession) -> Response: +def _get_simple_response( + url: str, session: PipSession, headers: dict[str, str] | None = None +) -> Response: """Access an Simple API response with GET, and return the response. This consists of three parts: @@ -119,10 +123,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response: and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_api_response(url, session=session) + _ensure_api_response(url, session=session, headers=headers) logger.debug("Getting page %s", redact_auth_from_url(url)) + logger.debug("headers: %s", str(headers)) + if headers is None: + headers = {} resp = session.get( url, headers={ @@ -147,6 +154,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response: # once per 10 minutes. # For more information, please see pypa/pip#5670. 
"Cache-Control": "max-age=0", + **headers, }, ) raise_for_status(resp) @@ -225,7 +233,7 @@ def parse_links(page: IndexContent) -> Iterable[Link]: if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) for file in data.get("files", []): - link = Link.from_json(file, page.url) + link = Link.from_json(file, page.url, page_content=page) if link is None: continue yield link @@ -238,7 +246,9 @@ def parse_links(page: IndexContent) -> Iterable[Link]: url = page.url base_url = parser.base_url or url for anchor in parser.anchors: - link = Link.from_element(anchor, page_url=url, base_url=base_url) + link = Link.from_element( + anchor, page_url=url, base_url=base_url, page_content=page + ) if link is None: continue yield link @@ -253,6 +263,8 @@ class IndexContent: :param cache_link_parsing: whether links parsed from this page's url should be cached. PyPI index urls should have this set to False, for example. + :param etag: The ``ETag`` header from an HTTP request against ``url``. + :param date: The ``Date`` header from an HTTP request against ``url``. """ content: bytes @@ -260,6 +272,8 @@ class IndexContent: encoding: str | None url: str cache_link_parsing: bool = True + etag: str | None = None + date: str | None = None def __str__(self) -> str: return redact_auth_from_url(self.url) @@ -304,7 +318,8 @@ def _handle_get_simple_fail( def _make_index_content( - response: Response, cache_link_parsing: bool = True + response: Response, + cache_link_parsing: bool = True, ) -> IndexContent: encoding = _get_encoding_from_headers(response.headers) return IndexContent( @@ -313,11 +328,15 @@ def _make_index_content( encoding=encoding, url=response.url, cache_link_parsing=cache_link_parsing, + etag=response.headers.get("ETag", None), + date=response.headers.get("Date", None), ) -def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None: - url = link.url.split("#", 1)[0] +def _get_index_content( + link: Link, *, session: PipSession, headers: dict[str, str] | None = None +) -> IndexContent | None: + url = link.url_without_fragment # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) @@ -344,7 +363,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | Non logger.debug(" file: URL is directory, getting %s", url) try: - resp = _get_simple_response(url, session=session) + resp = _get_simple_response(url, session=session, headers=headers) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " @@ -360,9 +379,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | Non exc.request_desc, exc.content_type, ) - except NetworkConnectionError as exc: - _handle_get_simple_fail(link, exc) - except RetryError as exc: + except (NetworkConnectionError, RetryError) as exc: _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " @@ -436,11 +453,14 @@ def create( def find_links(self) -> list[str]: return self.search_scope.find_links - def fetch_response(self, location: Link) -> IndexContent | None: + def fetch_response( + self, location: Link, headers: dict[str, str] | None = None + ) -> IndexContent | None: """ Fetch an HTML page containing package links. 
""" - return _get_index_content(location, session=self.session) + logger.debug("headers: %s", str(headers)) + return _get_index_content(location, session=self.session, headers=headers) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index bc523cd42d8..633f1eb855e 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -2,17 +2,21 @@ from __future__ import annotations +import binascii +import datetime import enum import functools import itertools import logging +import os import re +import time from collections.abc import Iterable from dataclasses import dataclass +from hashlib import sha256 +from pathlib import Path from typing import ( TYPE_CHECKING, - Optional, - Union, ) from pip._vendor.packaging import specifiers @@ -21,13 +25,14 @@ from pip._vendor.packaging.version import InvalidVersion, _BaseVersion from pip._vendor.packaging.version import parse as parse_version +from pip._internal.cache import FetchResolveCache from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, InvalidWheelFilename, UnsupportedWheel, ) -from pip._internal.index.collector import LinkCollector, parse_links +from pip._internal.index.collector import IndexContent, LinkCollector, parse_links from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.format_control import FormatControl from pip._internal.models.link import Link @@ -47,14 +52,14 @@ if TYPE_CHECKING: from typing_extensions import TypeGuard + BuildTag = tuple[()] | tuple[int, str] + CandidateSortingKey = tuple[int, int, int, _BaseVersion, int | None, BuildTag] + __all__ = ["FormatControl", "BestCandidateResult", "PackageFinder"] logger = getLogger(__name__) -BuildTag = Union[tuple[()], tuple[int, str]] -CandidateSortingKey = tuple[int, int, int, _BaseVersion, Optional[int], BuildTag] - def _check_link_requires_python( link: Link, @@ -593,6 +598,7 @@ def __init__( format_control: FormatControl | None = None, candidate_prefs: CandidatePreferences | None = None, ignore_requires_python: bool | None = None, + fetch_resolve_cache: FetchResolveCache | None = None, ) -> None: """ This constructor is primarily meant to be used by the create() class @@ -627,6 +633,8 @@ def __init__( BestCandidateResult, ] = {} + self._fetch_resolve_cache = fetch_resolve_cache + # Don't include an allow_yanked default value to make sure each call # site considers whether yanked releases are allowed. This also causes # that decision to be made explicit in the calling code, which helps @@ -637,6 +645,7 @@ def create( link_collector: LinkCollector, selection_prefs: SelectionPreferences, target_python: TargetPython | None = None, + fetch_resolve_cache: FetchResolveCache | None = None, ) -> PackageFinder: """Create a PackageFinder. 
@@ -661,6 +670,7 @@ def create( allow_yanked=selection_prefs.allow_yanked, format_control=selection_prefs.format_control, ignore_requires_python=selection_prefs.ignore_requires_python, + fetch_resolve_cache=fetch_resolve_cache, ) @property @@ -800,18 +810,235 @@ def evaluate_links( return candidates - def process_project_url( + _HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z" + + @classmethod + def _try_load_http_cache_headers( + cls, + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + headers: dict[str, str], + ) -> tuple[str | None, datetime.datetime | None, bytes | None]: + etag: str | None = None + try: + etag = etag_path.read_text() + etag = f'"{etag}"' + logger.debug( + "found cached etag for url %s at %s: %s", + project_url, + etag_path, + etag, + ) + headers["If-None-Match"] = etag + except OSError as e: + logger.debug("no etag found for url %s (%s)", project_url, str(e)) + + date: datetime.datetime | None = None + try: + date_bytes = date_path.read_bytes() + date_int = int.from_bytes(date_bytes, byteorder="big", signed=False) + date = datetime.datetime.fromtimestamp(date_int, tz=datetime.timezone.utc) + logger.debug( + "found cached date for url %s at %s: '%s'", + project_url, + date_path, + date, + ) + headers["If-Modified-Since"] = date.strftime(cls._HTTP_DATE_FORMAT) + except OSError as e: + logger.debug("no date found for url %s (%s)", project_url, str(e)) + + checksum: bytes | None = None + try: + checksum = checksum_path.read_bytes() + logger.debug( + "found checksum for url %s at %s: '%s'", + project_url, + checksum_path, + binascii.b2a_base64(checksum, newline=False).decode("ascii"), + ) + except OSError as e: + logger.debug("no checksum found for url %s (%s)", project_url, str(e)) + + return (etag, date, checksum) + + _quoted_value = re.compile(r'^"([^"]*)"$') + + @classmethod + def _strip_quoted_value(cls, value: str) -> str: + return cls._quoted_value.sub(r"\1", value) + + _now_local = datetime.datetime.now().astimezone() + _local_tz = _now_local.tzinfo + assert _local_tz is not None + _local_tz_name = _local_tz.tzname(_now_local) + + @classmethod + def _write_http_cache_info( + cls, + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + index_response: IndexContent, + prev_etag: str | None, + prev_checksum: bytes | None, + ) -> tuple[str | None, datetime.datetime | None, bytes, bool]: + hasher = sha256() + hasher.update(index_response.content) + new_checksum = hasher.digest() + checksum_path.write_bytes(new_checksum) + page_unmodified = new_checksum == prev_checksum + + new_etag: str | None = index_response.etag + if new_etag is None: + logger.debug("no etag returned from fetch for url %s", project_url.url) + try: + etag_path.unlink() + except OSError: + pass + else: + new_etag = cls._strip_quoted_value(new_etag) + if new_etag != prev_etag: + logger.debug( + "etag for url %s updated from %s -> %s", + project_url.url, + prev_etag, + new_etag, + ) + etag_path.write_text(new_etag) + else: + logger.debug( + "etag was unmodified for url %s (%s)", project_url.url, prev_etag + ) + assert page_unmodified + + new_date: datetime.datetime | None = None + date_str: str | None = index_response.date + if date_str is None: + logger.debug( + "no date header was provided in response for url %s", project_url + ) + else: + date_str = date_str.strip() + new_time = time.strptime(date_str, cls._HTTP_DATE_FORMAT) + new_date = datetime.datetime.strptime(date_str, cls._HTTP_DATE_FORMAT) + # strptime() doesn't set the timezone according 
+            # to the parsed %Z arg, which
+            # may be any of "UTC", "GMT", or any element of `time.tzname`.
+            if new_time.tm_zone in ["UTC", "GMT"]:
+                logger.debug(
+                    "a UTC timezone was found in response for url %s", project_url
+                )
+                new_date = new_date.replace(tzinfo=datetime.timezone.utc)
+            else:
+                assert new_time.tm_zone in time.tzname, new_time
+                logger.debug(
+                    "a local timezone %s was found in response for url %s",
+                    new_time.tm_zone,
+                    project_url,
+                )
+                if new_time.tm_zone == cls._local_tz_name:
+                    new_date = new_date.replace(tzinfo=cls._local_tz)
+                else:
+                    logger.debug(
+                        "a local timezone %s had to be discarded in response for url %s",
+                        new_time.tm_zone,
+                        project_url,
+                    )
+                    new_date = None
+
+        if new_date is not None:
+            timestamp = new_date.timestamp()
+            # The timestamp will only have second resolution according to the parse
+            # format string _HTTP_DATE_FORMAT.
+            assert not (timestamp % 1), (new_date, timestamp)
+            epoch = int(timestamp)
+            assert epoch >= 0, (new_date, timestamp, epoch)
+            date_bytes = epoch.to_bytes(length=4, byteorder="big", signed=False)
+            date_path.write_bytes(date_bytes)
+
+            logger.debug('date "%s" written for url %s', new_date, project_url)
+        if new_date is None:
+            try:
+                date_path.unlink()
+            except OSError:
+                pass
+
+        return (new_etag, new_date, new_checksum, page_unmodified)
+
+    def _process_project_url_uncached(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> list[InstallationCandidate]:
         logger.debug(
             "Fetching project page and analyzing links: %s",
             project_url,
         )
+
         index_response = self._link_collector.fetch_response(project_url)
         if index_response is None:
             return []
 
-        page_links = list(parse_links(index_response))
+        page_links = parse_links(index_response)
+
+        with indent_log():
+            package_links = self.evaluate_links(link_evaluator, links=page_links)
+            return package_links
+
+    def process_project_url(
+        self, project_url: Link, link_evaluator: LinkEvaluator
+    ) -> list[InstallationCandidate]:
+        if self._fetch_resolve_cache is None:
+            return self._process_project_url_uncached(project_url, link_evaluator)
+
+        cached_path = self._fetch_resolve_cache.cache_path(project_url)
+        os.makedirs(str(cached_path), exist_ok=True)
+
+        etag_path = cached_path / "etag"
+        date_path = cached_path / "modified-since-date"
+        checksum_path = cached_path / "checksum"
+
+        headers: dict[str, str] = {
+            # Wipe any other Cache-Control headers away--we are explicitly managing the
+            # caching here.
+            "Cache-Control": "",
+        }
+        # NB: mutates headers!
+        prev_etag, _prev_date, prev_checksum = self._try_load_http_cache_headers(
+            etag_path, date_path, checksum_path, project_url, headers
+        )
+
+        logger.debug(
+            "Fetching project page and analyzing links: %s",
+            project_url,
+        )
+
+        # A 304 Not Modified is implicitly converted into a reused cached response by
+        # the CacheControl library, so we won't explicitly check for a 304.
+ index_response = self._link_collector.fetch_response( + project_url, + headers=headers, + ) + if index_response is None: + return [] + + ( + _new_etag, + _new_date, + _new_checksum, + page_unmodified, + ) = self._write_http_cache_info( + etag_path, + date_path, + checksum_path, + project_url, + index_response, + prev_etag=prev_etag, + prev_checksum=prev_checksum, + ) + + page_links = parse_links(index_response) with indent_log(): package_links = self.evaluate_links( diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2e2c0f836ac..42c948cd955 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -281,6 +281,7 @@ def from_json( cls, file_data: dict[str, Any], page_url: str, + page_content: IndexContent | None = None, ) -> Link | None: """ Convert an pypi json document from a simple repository page into a Link. @@ -320,7 +321,7 @@ def from_json( return cls( url, - comes_from=page_url, + comes_from=page_content or page_url, requires_python=pyrequire, yanked_reason=yanked_reason, hashes=hashes, @@ -333,6 +334,7 @@ def from_element( anchor_attribs: dict[str, str | None], page_url: str, base_url: str, + page_content: IndexContent | None = None, ) -> Link | None: """ Convert an anchor element's attributes in a simple repository page to a Link. @@ -373,7 +375,7 @@ def from_element( return cls( url, - comes_from=page_url, + comes_from=page_content or page_url, requires_python=pyrequire, yanked_reason=yanked_reason, metadata_file_data=metadata_file_data, diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index fa688f8e42f..56446bb7aa9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -98,7 +98,7 @@ def test_get_simple_response_archive_to_http_scheme( session.assert_has_calls( [ - mock.call.head(url, allow_redirects=True), + mock.call.head(url, allow_redirects=True, headers=None), ] ) mock_raise_for_status.assert_called_once_with(session.head.return_value) @@ -160,7 +160,7 @@ def test_get_simple_response_archive_to_http_scheme_is_html( assert resp is not None assert session.mock_calls == [ - mock.call.head(url, allow_redirects=True), + mock.call.head(url, allow_redirects=True, headers=None), mock.call.get( url, headers={ @@ -248,7 +248,7 @@ def test_get_simple_response_dont_log_clear_text_password( assert resp is not None mock_raise_for_status.assert_called_once_with(resp) - assert len(caplog.records) == 2 + assert len(caplog.records) == 3 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ @@ -256,6 +256,9 @@ def test_get_simple_response_dont_log_clear_text_password( ] record = caplog.records[1] assert record.levelname == "DEBUG" + assert record.message.splitlines() == ["headers: None"] + record = caplog.records[2] + assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Fetched page https://user:****@example.com/simple/ as text/html", ] @@ -849,7 +852,7 @@ def test_get_index_content_directory_append_index(tmpdir: Path) -> None: mock_func.return_value = fake_response actual = _get_index_content(Link(dir_url), session=session) assert mock_func.mock_calls == [ - mock.call(expected_url, session=session), + mock.call(expected_url, headers=None, session=session), ], f"actual calls: {mock_func.mock_calls}" assert actual is not None @@ -965,6 +968,7 @@ def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None: # _get_simple_response(). 
mock_get_simple_response.assert_called_once_with( url, + headers=None, session=link_collector.session, )
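As a closing note on the cache layout introduced above: the `modified-since-date` file stores the parsed `Date` header as a four-byte big-endian unsigned epoch, which round-trips losslessly because HTTP-dates only carry second resolution. A self-contained sketch of that encode/decode pair (the helper names are illustrative, not part of pip's API):

```python
import datetime


def encode_http_date(date: datetime.datetime) -> bytes:
    # HTTP Date headers have second resolution, so the float timestamp is an
    # exact integer here.
    epoch = int(date.timestamp())
    assert epoch >= 0
    # Four big-endian bytes cover epochs through the year 2106.
    return epoch.to_bytes(length=4, byteorder="big", signed=False)


def decode_http_date(raw: bytes) -> datetime.datetime:
    epoch = int.from_bytes(raw, byteorder="big", signed=False)
    return datetime.datetime.fromtimestamp(epoch, tz=datetime.timezone.utc)


stamp = datetime.datetime(2024, 7, 20, 18, 13, 50, tzinfo=datetime.timezone.utc)
assert decode_http_date(encode_http_date(stamp)) == stamp
```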