@@ -87,7 +87,9 @@ class _NotHTTP(Exception):
     pass
 
 
-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: dict[str, str] | None = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -99,13 +101,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()
 
-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)
 
     _ensure_api_header(resp)
 
 
-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: dict[str, str] | None = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.
 
     This consists of three parts:
@@ -119,10 +123,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
        and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)
 
     logger.debug("Getting page %s", redact_auth_from_url(url))
 
+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -147,6 +154,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -225,7 +233,7 @@ def parse_links(page: IndexContent) -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -238,7 +246,9 @@ def parse_links(page: IndexContent) -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -253,13 +263,17 @@ class IndexContent:
     :param cache_link_parsing: whether links parsed from this page's url
                                should be cached. PyPI index urls should
                                have this set to False, for example.
+    :param etag: The ``ETag`` header from an HTTP request against ``url``.
+    :param date: The ``Date`` header from an HTTP request against ``url``.
     """
 
     content: bytes
     content_type: str
     encoding: str | None
     url: str
     cache_link_parsing: bool = True
+    etag: str | None = None
+    date: str | None = None
 
     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
@@ -304,7 +318,8 @@ def _handle_get_simple_fail(
 
 
 def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
+    response: Response,
+    cache_link_parsing: bool = True,
 ) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
@@ -313,11 +328,15 @@ def _make_index_content(
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )
 
 
-def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: dict[str, str] | None = None
+) -> IndexContent | None:
+    url = link.url_without_fragment
 
     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -344,7 +363,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None:
         logger.debug(" file: URL is directory, getting %s", url)
 
     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -360,9 +379,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> IndexContent | None:
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -436,11 +453,14 @@ def create(
     def find_links(self) -> list[str]:
         return self.search_scope.find_links
 
-    def fetch_response(self, location: Link) -> IndexContent | None:
+    def fetch_response(
+        self, location: Link, headers: dict[str, str] | None = None
+    ) -> IndexContent | None:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)
 
     def collect_sources(
         self,
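
Taken together, the new `headers` parameter and the `etag`/`date` fields on `IndexContent` are the plumbing for conditional index fetches. Below is a minimal sketch, not part of this commit, of how a caller might turn a previously cached `IndexContent` into `If-None-Match`/`If-Modified-Since` request headers; the `conditional_headers` helper and the `cached` variable are hypothetical names introduced here for illustration.

```python
# Hypothetical caller-side helper (not in this diff): build conditional-request
# headers from a cached IndexContent so an unchanged index page can come back
# as a cheap "304 Not Modified" instead of a full response body.
from __future__ import annotations

from pip._internal.index.collector import IndexContent


def conditional_headers(cached: IndexContent | None) -> dict[str, str] | None:
    """Derive If-None-Match / If-Modified-Since headers from a cached page."""
    if cached is None:
        return None
    headers: dict[str, str] = {}
    if cached.etag is not None:
        # Echo the entity tag back so the server can answer 304 on a match.
        headers["If-None-Match"] = cached.etag
    if cached.date is not None:
        # The Date header is already an HTTP-date, so it can be sent verbatim.
        headers["If-Modified-Since"] = cached.date
    return headers or None
```

A caller would then pass the result straight through, e.g. `collector.fetch_response(location, headers=conditional_headers(cached))`. Note that because `**headers` is spread after the hard-coded `"Cache-Control": "max-age=0"` entry in `_get_simple_response`, a caller-supplied `Cache-Control` value overrides the default, which lets a caching layer opt out of the forced revalidation.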