--- a/src/pip/_internal/index/collector.py
+++ b/src/pip/_internal/index/collector.py
@@ -9,10 +9,8 @@
 import json
 import logging
 import os
-import re
 import urllib.parse
 import urllib.request
-import xml.etree.ElementTree
 from html.parser import HTMLParser
 from optparse import Values
 from typing import (
@@ -39,7 +37,7 @@
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
-from pip._internal.utils.misc import pairwise, redact_auth_from_url
+from pip._internal.utils.misc import redact_auth_from_url
 from pip._internal.vcs import vcs
 
 from .sources import CandidatesFromPage, LinkSource, build_source
@@ -51,7 +49,6 @@
 
 logger = logging.getLogger(__name__)
 
-HTMLElement = xml.etree.ElementTree.Element
 ResponseHeaders = MutableMapping[str, str]
 
 
@@ -191,94 +188,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
     return None
 
 
-def _clean_url_path_part(part: str) -> str:
-    """
-    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    return urllib.parse.quote(urllib.parse.unquote(part))
-
-
-def _clean_file_url_path(part: str) -> str:
-    """
-    Clean the first part of a URL path that corresponds to a local
-    filesystem path (i.e. the first part after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    # Also, on Windows the path part might contain a drive letter which
-    # should not be quoted. On Linux where drive letters do not
-    # exist, the colon should be quoted. We rely on urllib.request
-    # to do the right thing here.
-    return urllib.request.pathname2url(urllib.request.url2pathname(part))
-
-
-# percent-encoded: /
-_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
-
-
-def _clean_url_path(path: str, is_local_path: bool) -> str:
-    """
-    Clean the path portion of a URL.
-    """
-    if is_local_path:
-        clean_func = _clean_file_url_path
-    else:
-        clean_func = _clean_url_path_part
-
-    # Split on the reserved characters prior to cleaning so that
-    # revision strings in VCS URLs are properly preserved.
-    parts = _reserved_chars_re.split(path)
-
-    cleaned_parts = []
-    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
-        cleaned_parts.append(clean_func(to_clean))
-        # Normalize %xx escapes (e.g. %2f -> %2F)
-        cleaned_parts.append(reserved.upper())
-
-    return "".join(cleaned_parts)
-
-
-def _clean_link(url: str) -> str:
-    """
-    Make sure a link is fully quoted.
-    For example, if ' ' occurs in the URL, it will be replaced with "%20",
-    and without double-quoting other characters.
-    """
-    # Split the URL into parts according to the general structure
-    # `scheme://netloc/path;parameters?query#fragment`.
-    result = urllib.parse.urlparse(url)
-    # If the netloc is empty, then the URL refers to a local filesystem path.
-    is_local_path = not result.netloc
-    path = _clean_url_path(result.path, is_local_path=is_local_path)
-    return urllib.parse.urlunparse(result._replace(path=path))
-
-
-def _create_link_from_element(
-    element_attribs: Dict[str, Optional[str]],
-    page_url: str,
-    base_url: str,
-) -> Optional[Link]:
-    """
-    Convert an anchor element's attributes in a simple repository page to a Link.
-    """
-    href = element_attribs.get("href")
-    if not href:
-        return None
-
-    url = _clean_link(urllib.parse.urljoin(base_url, href))
-    pyrequire = element_attribs.get("data-requires-python")
-    yanked_reason = element_attribs.get("data-yanked")
-
-    link = Link(
-        url,
-        comes_from=page_url,
-        requires_python=pyrequire,
-        yanked_reason=yanked_reason,
-    )
-
-    return link
-
-
 class CacheablePageContent:
     def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
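None of the helpers deleted above are lost: in this refactor they move alongside the new `Link` constructors (into the `Link` model module, presumably `pip._internal.models.link`), so the URL-cleaning behavior is unchanged. A rough illustration of that behavior, using only the stdlib calls the helpers wrap (the example values are made up):

    import urllib.parse

    # Unquote-then-quote percent-encodes exactly once, so an already
    # encoded space is not double-quoted:
    assert urllib.parse.quote(urllib.parse.unquote("my wheel")) == "my%20wheel"
    assert urllib.parse.quote(urllib.parse.unquote("my%20wheel")) == "my%20wheel"

    # Splitting on "@" / "%2F" before cleaning is what keeps VCS revision
    # markers intact: "git+https://host/repo.git@v1.0" still ends in "@v1.0".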
@@ -326,25 +235,10 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            file_url = file.get("url")
-            if file_url is None:
+            link = Link.from_json(file, page.url)
+            if link is None:
                 continue
-
-            # The Link.yanked_reason expects an empty string instead of a boolean.
-            yanked_reason = file.get("yanked")
-            if yanked_reason and not isinstance(yanked_reason, str):
-                yanked_reason = ""
-            # The Link.yanked_reason expects None instead of False
-            elif not yanked_reason:
-                yanked_reason = None
-
-            yield Link(
-                _clean_link(urllib.parse.urljoin(page.url, file_url)),
-                comes_from=page.url,
-                requires_python=file.get("requires-python"),
-                yanked_reason=yanked_reason,
-                hashes=file.get("hashes", {}),
-            )
+            yield link
         return
 
     parser = HTMLLinkParser(page.url)
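The deleted JSON branch is consolidated into the new `Link.from_json` constructor called above. A minimal sketch of that classmethod, assuming it simply relocates the logic removed in this hunk (the exact signature, and whether the moved `_clean_link` helper keeps its name in the destination module, are assumptions):

    @classmethod
    def from_json(
        cls,
        file_data: Dict[str, Any],
        page_url: str,
    ) -> Optional["Link"]:
        """Build a Link from one entry in a simple-API JSON page's "files" list."""
        file_url = file_data.get("url")
        if file_url is None:
            return None

        # Link.yanked_reason expects "" for a bare `"yanked": true`,
        # and None (not False) when the file is not yanked.
        yanked_reason = file_data.get("yanked")
        if yanked_reason and not isinstance(yanked_reason, str):
            yanked_reason = ""
        elif not yanked_reason:
            yanked_reason = None

        return cls(
            _clean_link(urllib.parse.urljoin(page_url, file_url)),
            comes_from=page_url,
            requires_python=file_data.get("requires-python"),
            yanked_reason=yanked_reason,
            hashes=file_data.get("hashes", {}),
        )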
@@ -354,11 +248,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = _create_link_from_element(
-            anchor,
-            page_url=url,
-            base_url=base_url,
-        )
+        link = Link.from_element(anchor, page_url=url, base_url=base_url)
         if link is None:
             continue
         yield link
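`Link.from_element` plays the same role for the HTML path, absorbing the deleted `_create_link_from_element` while keeping `page_url`/`base_url` as keyword arguments. A quick usage sketch with a hand-built anchor dict of the shape `HTMLLinkParser` collects (the href, metadata values, and index URL here are all hypothetical):

    index_url = "https://index.example/simple/example/"
    anchor = {
        "href": "example-1.0.tar.gz",       # hypothetical file link
        "data-requires-python": ">=3.7",    # optional PEP 503 metadata
        "data-yanked": None,                # None -> not yanked; a string gives the yank reason
    }
    link = Link.from_element(anchor, page_url=index_url, base_url=index_url)
    if link is not None:
        print(link.url, link.requires_python)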