
Commit a6a0279

make FetchResolveCache
- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- handle --no-cache-dir
- add NEWS
- remove quotes from etag and use binary checksum to save a few bytes
- parse http modified date to compress the cached representation
- fix cache-control clobbering

1 parent 71a558b commit a6a0279

7 files changed (+310, -39 lines)


news/12257.feature.rst

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.
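
The bandwidth saving claimed here comes from standard HTTP conditional requests: replaying a stored ETag as If-None-Match (and a stored Date as If-Modified-Since) lets the server answer 304 Not Modified with an empty body. A minimal sketch of that exchange using requests, independent of pip's internals (the URL is only an example):

import requests

URL = "https://pypi.org/simple/pip/"
ACCEPT = {"Accept": "application/vnd.pypi.simple.v1+json"}

# First fetch: remember the validators the server sends back.
first = requests.get(URL, headers=ACCEPT)
etag = first.headers.get("ETag")
date = first.headers.get("Date")

# Later fetch: replay them. A 304 reply has no body, so only headers
# cross the wire and the cached copy can be reused.
conditional = dict(ACCEPT)
if etag:
    conditional["If-None-Match"] = etag
if date:
    conditional["If-Modified-Since"] = date
second = requests.get(URL, headers=conditional)
if second.status_code == 304:
    print("index page unchanged; reusing cached copy")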

src/pip/_internal/cache.py

Lines changed: 22 additions & 10 deletions
@@ -93,7 +93,9 @@ def __init__(self, cache_dir: str) -> None:
         assert not cache_dir or os.path.isabs(cache_dir)
         self.cache_dir = cache_dir or None
 
-    def _get_cache_path_parts(self, link: Link) -> list[str]:
+    def _get_cache_path_parts(
+        self, link: Link, *, interpreter_dependent: bool
+    ) -> list[str]:
         """Get parts of part that must be os.path.joined with cache_dir"""
 
         # We want to generate an url to use as our cache key, we don't want to
@@ -105,13 +107,14 @@ def _get_cache_path_parts(self, link: Link) -> list[str]:
         if link.subdirectory_fragment:
             key_parts["subdirectory"] = link.subdirectory_fragment
 
-        # Include interpreter name, major and minor version in cache key
-        # to cope with ill-behaved sdists that build a different wheel
-        # depending on the python version their setup.py is being run on,
-        # and don't encode the difference in compatibility tags.
-        # https://github.com/pypa/pip/issues/7296
-        key_parts["interpreter_name"] = interpreter_name()
-        key_parts["interpreter_version"] = interpreter_version()
+        if interpreter_dependent:
+            # Include interpreter name, major and minor version in cache key
+            # to cope with ill-behaved sdists that build a different wheel
+            # depending on the python version their setup.py is being run on,
+            # and don't encode the difference in compatibility tags.
+            # https://github.com/pypa/pip/issues/7296
+            key_parts["interpreter_name"] = interpreter_name()
+            key_parts["interpreter_version"] = interpreter_version()
 
         # Encode our key url with sha224, we'll use this because it has similar
         # security properties to sha256, but with a shorter total output (and
@@ -139,11 +142,20 @@ class LinkMetadataCache(Cache):
     """Persistently store the metadata of dists found at each link."""
 
     def get_path_for_link(self, link: Link) -> str:
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         return os.path.join(self.cache_dir, "link-metadata", *parts)
 
 
+class FetchResolveCache(Cache):
+    def get_path_for_link(self, link: Link) -> str:
+        # We are reading index links to extract other links from, not executing any
+        # python code, so these caches are interpreter-independent.
+        parts = self._get_cache_path_parts(link, interpreter_dependent=False)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "fetch-resolve", *parts)
+
+
 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""
 
@@ -198,7 +210,7 @@ def get_path_for_link(self, link: Link) -> str:
 
         :param link: The link of the sdist for which this will cache wheels.
         """
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         # Store wheels within the root cache_dir
         return os.path.join(self.cache_dir, "wheels", *parts)
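
As the surviving context lines note, cache keys are derived by hashing the key URL with sha224. A rough sketch of the path-sharding idea (a hypothetical helper, not pip's exact code — the real _get_cache_path_parts also folds in subdirectory fragments, hashes, and the interpreter details when interpreter_dependent=True):

import hashlib
import os

def cache_path_parts(key_url: str) -> list[str]:
    # sha224 offers security properties similar to sha256 with a slightly
    # shorter digest, which keeps the on-disk paths a little shorter.
    digest = hashlib.sha224(key_url.encode()).hexdigest()
    # Shard the digest into nested directories so that no single cache
    # directory accumulates an unbounded number of entries.
    return [digest[:2], digest[2:4], digest[4:6], digest[6:]]

print(os.path.join("~/.cache/pip/fetch-resolve",
                   *cache_path_parts("https://pypi.org/simple/pip/")))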

src/pip/_internal/cli/req_command.py

Lines changed: 6 additions & 1 deletion
@@ -13,7 +13,7 @@
 from typing import Any
 
 from pip._internal.build_env import SubprocessBuildEnvironmentInstaller
-from pip._internal.cache import LinkMetadataCache, WheelCache
+from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.index_command import IndexGroupCommand
 from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin
@@ -355,8 +355,13 @@ def _build_package_finder(
             ignore_requires_python=ignore_requires_python,
         )
 
+        if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
+            fetch_resolve_cache = FetchResolveCache(options.cache_dir)
+        else:
+            fetch_resolve_cache = None
         return PackageFinder.create(
             link_collector=link_collector,
             selection_prefs=selection_prefs,
             target_python=target_python,
+            fetch_resolve_cache=fetch_resolve_cache,
         )
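
The gate above makes the new cache doubly opt-in: --no-cache-dir leaves options.cache_dir falsy, and the feature must be enabled with --use-feature=metadata-cache. A small sketch of the same decision against a stand-in options object (SimpleNamespace here is only a stub for pip's parsed options):

from types import SimpleNamespace

def wants_fetch_resolve_cache(options) -> bool:
    # Mirrors the condition in the diff: a usable cache directory and the
    # opt-in feature flag are both required.
    return bool(options.cache_dir) and "metadata-cache" in options.features_enabled

assert wants_fetch_resolve_cache(
    SimpleNamespace(cache_dir="/home/user/.cache/pip",
                    features_enabled=["metadata-cache"])
)
# --no-cache-dir zeroes out cache_dir, which disables the cache entirely.
assert not wants_fetch_resolve_cache(
    SimpleNamespace(cache_dir=None, features_enabled=["metadata-cache"])
)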

src/pip/_internal/index/collector.py

Lines changed: 35 additions & 15 deletions
@@ -87,7 +87,9 @@ class _NotHTTP(Exception):
     pass
 
 
-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: dict[str, str] | None = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -99,13 +101,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()
 
-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)
 
     _ensure_api_header(resp)
 
 
-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: dict[str, str] | None = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.
 
     This consists of three parts:
@@ -119,10 +123,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
     and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)
 
     logger.debug("Getting page %s", redact_auth_from_url(url))
 
+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -147,6 +154,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -225,7 +233,7 @@ def parse_links(page: IndexContent) -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -238,7 +246,9 @@ def parse_links(page: IndexContent) -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -253,13 +263,17 @@ class IndexContent:
     :param cache_link_parsing: whether links parsed from this page's url
                                should be cached. PyPI index urls should
                                have this set to False, for example.
+    :param etag: The ``ETag`` header from an HTTP request against ``url``.
+    :param date: The ``Date`` header from an HTTP request against ``url``.
     """
 
     content: bytes
     content_type: str
     encoding: str | None
     url: str
     cache_link_parsing: bool = True
+    etag: str | None = None
+    date: str | None = None
 
     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
@@ -304,7 +318,8 @@ def _handle_get_simple_fail(
 
 
 def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
+    response: Response,
+    cache_link_parsing: bool = True,
 ) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
@@ -313,11 +328,15 @@ def _make_index_content(
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )
 
 
-def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: dict[str, str] | None = None
+) -> IndexContent | None:
+    url = link.url_without_fragment
 
     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -344,7 +363,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None:
         logger.debug(" file: URL is directory, getting %s", url)
 
     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -360,9 +379,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None:
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -436,11 +453,14 @@ def create(
     def find_links(self) -> list[str]:
         return self.search_scope.find_links
 
-    def fetch_response(self, location: Link) -> IndexContent | None:
+    def fetch_response(
+        self, location: Link, headers: dict[str, str] | None = None
+    ) -> IndexContent | None:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)
 
     def collect_sources(
         self,
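
Taken together, the collector changes thread a headers dict from LinkCollector.fetch_response down through _get_index_content into _get_simple_response, where it is merged last (**headers) so caller-supplied values win over the hardcoded defaults such as Cache-Control — the "fix cache-control clobbering" item in the commit message. A sketch of how a caller holding a previously cached IndexContent might build that dict (conditional_headers is a hypothetical helper, not part of this diff):

def conditional_headers(etag: str | None, date: str | None) -> dict[str, str]:
    # Convert the validators stored on IndexContent back into request
    # headers for a conditional re-fetch of the same index page.
    headers: dict[str, str] = {}
    if etag is not None:
        headers["If-None-Match"] = etag
    if date is not None:
        headers["If-Modified-Since"] = date
    return headers

# e.g.:
#   headers = conditional_headers(cached.etag, cached.date)
#   collector.fetch_response(location, headers=headers)
# A 304 reply would let the caller reuse its cached parse of the page.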
