11"""
2- The main purpose of this module is to expose LinkCollector.collect_links ().
2+ The main purpose of this module is to expose LinkCollector.collect_sources ().
33"""
44
55import cgi
6+ import collections
67import functools
78import html
89import itertools
910import logging
10- import mimetypes
1111import os
1212import re
1313import urllib .parse
1414import urllib .request
1515import xml .etree .ElementTree
16- from collections import OrderedDict
1716from optparse import Values
1817from typing import (
1918 Callable ,
2019 Iterable ,
2120 List ,
2221 MutableMapping ,
22+ NamedTuple ,
2323 Optional ,
2424 Sequence ,
25- Tuple ,
2625 Union ,
2726)
2827
@@ -37,8 +36,9 @@
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
-from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs
+from pip._internal.vcs import vcs
+
+from .sources import CandidatesFromPage, LinkSource, build_source
 
 logger = logging.getLogger(__name__)
 
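The `.sources` module imported above is introduced elsewhere in this commit and is not shown here. A rough sketch of its interface, inferred only from how the names are used later in this file (the real definitions may differ; `Protocol` needs Python 3.8+):

# Inferred sketch of the .sources interface; not the actual module.
from typing import Callable, Iterable, Optional, Protocol, Tuple

from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.link import Link

# Callback that turns the links parsed from one page into candidates.
CandidatesFromPage = Callable[
    [Iterable[Link]], Iterable[InstallationCandidate]
]


class LinkSource(Protocol):
    @property
    def link(self) -> Optional[Link]:
        """The URL behind this source, if any (used for logging below)."""


def build_source(
    location: str,
    *,
    candidates_from_page: CandidatesFromPage,
    page_validator: Callable[[Link], bool],
    expand_dir: bool,
    cache_link_parsing: bool,
) -> Tuple[Optional[str], Optional[LinkSource]]:
    # Returning (url, source) pairs is what lets collect_sources() below
    # deduplicate locations by URL via collections.OrderedDict.
    ...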
@@ -449,107 +449,9 @@ def _get_html_page(link, session=None):
     return None
 
 
-def _remove_duplicate_links(links):
-    # type: (Iterable[Link]) -> List[Link]
-    """
-    Return a list of links, with duplicates removed and ordering preserved.
-    """
-    # We preserve the ordering when removing duplicates because we can.
-    return list(OrderedDict.fromkeys(links))
-
-
-def group_locations(locations, expand_dir=False):
-    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
-    """
-    Divide a list of locations into two groups: "files" (archives) and "urls."
-
-    :return: A pair of lists (files, urls).
-    """
-    files = []
-    urls = []
-
-    # puts the url for the given file path into the appropriate list
-    def sort_path(path):
-        # type: (str) -> None
-        url = path_to_url(path)
-        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
-            urls.append(url)
-        else:
-            files.append(url)
-
-    for url in locations:
-
-        is_local_path = os.path.exists(url)
-        is_file_url = url.startswith('file:')
-
-        if is_local_path or is_file_url:
-            if is_local_path:
-                path = url
-            else:
-                path = url_to_path(url)
-            if os.path.isdir(path):
-                if expand_dir:
-                    path = os.path.realpath(path)
-                    for item in os.listdir(path):
-                        sort_path(os.path.join(path, item))
-                elif is_file_url:
-                    urls.append(url)
-                else:
-                    logger.warning(
-                        "Path '%s' is ignored: it is a directory.", path,
-                    )
-            elif os.path.isfile(path):
-                sort_path(path)
-            else:
-                logger.warning(
-                    "Url '%s' is ignored: it is neither a file "
-                    "nor a directory.", url,
-                )
-        elif is_url(url):
-            # Only add url with clear scheme
-            urls.append(url)
-        else:
-            logger.warning(
-                "Url '%s' is ignored. It is either a non-existing "
-                "path or lacks a specific scheme.", url,
-            )
-
-    return files, urls
-
-
-class CollectedLinks:
-
-    """
-    Encapsulates the return value of a call to LinkCollector.collect_links().
-
-    The return value includes both URLs to project pages containing package
-    links, as well as individual package Link objects collected from other
-    sources.
-
-    This info is stored separately as:
-
-    (1) links from the configured file locations,
-    (2) links from the configured find_links, and
-    (3) urls to HTML project pages, as described by the PEP 503 simple
-        repository API.
-    """
-
-    def __init__(
-        self,
-        files,         # type: List[Link]
-        find_links,    # type: List[Link]
-        project_urls,  # type: List[Link]
-    ):
-        # type: (...) -> None
-        """
-        :param files: Links from file locations.
-        :param find_links: Links from find_links.
-        :param project_urls: URLs to HTML project pages, as described by
-            the PEP 503 simple repository API.
-        """
-        self.files = files
-        self.find_links = find_links
-        self.project_urls = project_urls
+class CollectedSources(NamedTuple):
+    find_links: Sequence[Optional[LinkSource]]
+    index_urls: Sequence[Optional[LinkSource]]
 
 
 class LinkCollector:
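`CollectedSources` is a plain `NamedTuple` replacing the old `CollectedLinks` container: instead of eagerly built `Link` lists it carries lazy `LinkSource` entries, with `None` standing in for locations that `build_source()` rejected. A minimal consumption sketch, assuming `LinkSource` exposes `file_links()` and `page_candidates()` methods (an assumption; they are not shown in this commit):

import itertools

def iter_file_links(sources):
    # Walk both groups, skipping locations that build_source() rejected.
    for source in itertools.chain(sources.find_links, sources.index_urls):
        if source is None:
            continue
        # file_links() is assumed to yield Link objects for on-disk
        # archives; index pages would go through page_candidates().
        yield from source.file_links()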
@@ -558,7 +460,7 @@ class LinkCollector:
     Responsible for collecting Link objects from all configured locations,
     making network requests as needed.
 
-    The class's main method is its collect_links() method.
+    The class's main method is its collect_sources() method.
     """
 
     def __init__(
@@ -609,51 +511,46 @@ def fetch_page(self, location):
609511 """
610512 return _get_html_page (location , session = self .session )
611513
612- def collect_links (self , project_name ):
613- # type: (str) -> CollectedLinks
614- """Find all available links for the given project name.
615-
616- :return: All the Link objects (unfiltered), as a CollectedLinks object.
617- """
618- search_scope = self .search_scope
619- index_locations = search_scope .get_index_urls_locations (project_name )
620- index_file_loc , index_url_loc = group_locations (index_locations )
621- fl_file_loc , fl_url_loc = group_locations (
622- self .find_links , expand_dir = True ,
623- )
624-
625- file_links = [
626- Link (url ) for url in itertools .chain (index_file_loc , fl_file_loc )
627- ]
628-
629- # We trust every directly linked archive in find_links
630- find_link_links = [Link (url , '-f' ) for url in self .find_links ]
631-
632- # We trust every url that the user has given us whether it was given
633- # via --index-url or --find-links.
634- # We want to filter out anything that does not have a secure origin.
635- url_locations = [
636- link for link in itertools .chain (
637- # Mark PyPI indices as "cache_link_parsing == False" -- this
638- # will avoid caching the result of parsing the page for links.
639- (Link (url , cache_link_parsing = False ) for url in index_url_loc ),
640- (Link (url ) for url in fl_url_loc ),
514+ def collect_sources (
515+ self ,
516+ project_name : str ,
517+ candidates_from_page : CandidatesFromPage ,
518+ ) -> CollectedSources :
519+ # The OrderedDict calls deduplicate sources by URL.
520+ index_url_sources = collections .OrderedDict (
521+ build_source (
522+ loc ,
523+ candidates_from_page = candidates_from_page ,
524+ page_validator = self .session .is_secure_origin ,
525+ expand_dir = False ,
526+ cache_link_parsing = False ,
527+ )
528+ for loc in self .search_scope .get_index_urls_locations (project_name )
529+ ).values ()
530+ find_links_sources = collections .OrderedDict (
531+ build_source (
532+ loc ,
533+ candidates_from_page = candidates_from_page ,
534+ page_validator = self .session .is_secure_origin ,
535+ expand_dir = True ,
536+ cache_link_parsing = True ,
641537 )
642- if self .session .is_secure_origin (link )
643- ]
644-
645- url_locations = _remove_duplicate_links (url_locations )
646- lines = [
647- '{} location(s) to search for versions of {}:' .format (
648- len (url_locations ), project_name ,
649- ),
650- ]
651- for link in url_locations :
652- lines .append (f'* { link } ' )
653- logger .debug ('\n ' .join (lines ))
654-
655- return CollectedLinks (
656- files = file_links ,
657- find_links = find_link_links ,
658- project_urls = url_locations ,
538+ for loc in self .find_links
539+ ).values ()
540+
541+ if logger .isEnabledFor (logging .DEBUG ):
542+ lines = [
543+ f"* { s .link } "
544+ for s in itertools .chain (find_links_sources , index_url_sources )
545+ if s is not None and s .link is not None
546+ ]
547+ lines = [
548+ f"{ len (lines )} location(s) to search "
549+ f"for versions of { project_name } :"
550+ ] + lines
551+ logger .debug ("\n " .join (lines ))
552+
553+ return CollectedSources (
554+ find_links = list (find_links_sources ),
555+ index_urls = list (index_url_sources ),
659556 )
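Taken together, the new entry point might be exercised as in this rough sketch; `collector` stands in for a fully configured LinkCollector (session and search-scope wiring omitted), and the pass-through callback is a placeholder for the candidate-building logic that pip's PackageFinder supplies in practice:

import itertools

def passthrough_candidates_from_page(links):
    # Placeholder CandidatesFromPage callback: return parsed links as-is.
    return links


sources = collector.collect_sources(
    project_name="pip",
    candidates_from_page=passthrough_candidates_from_page,
)
for source in itertools.chain(sources.find_links, sources.index_urls):
    if source is not None and source.link is not None:
        print(source.link)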